diff -ur ../vger3-990605/linux/drivers/net/myri_sbus.c linux/drivers/net/myri_sbus.c --- ../vger3-990605/linux/drivers/net/myri_sbus.c Mon Apr 5 19:23:03 1999 +++ linux/drivers/net/myri_sbus.c Sat Jun 5 19:21:23 1999 @@ -756,6 +756,7 @@ eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, dev->addr_len); memcpy(eth->h_dest, neigh->ha, dev->addr_len); + hh->hh_len = 16; return 0; } diff -ur ../vger3-990605/linux/include/linux/igmp.h linux/include/linux/igmp.h --- ../vger3-990605/linux/include/linux/igmp.h Sun Dec 14 00:52:05 1997 +++ linux/include/linux/igmp.h Sat Jun 5 21:35:58 1999 @@ -101,19 +101,7 @@ char loaded; }; -extern __inline__ int ip_check_mc(struct device *dev, u32 mc_addr) -{ - struct in_device *in_dev = dev->ip_ptr; - struct ip_mc_list *im; - - if (in_dev) { - for (im=in_dev->mc_list; im; im=im->next) - if (im->multiaddr == mc_addr) - return 1; - } - return 0; -} - +extern int ip_check_mc(struct device *dev, u32 mc_addr); extern int igmp_rcv(struct sk_buff *, unsigned short); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); diff -ur ../vger3-990605/linux/include/linux/inet.h linux/include/linux/inet.h --- ../vger3-990605/linux/include/linux/inet.h Tue May 13 11:44:57 1997 +++ linux/include/linux/inet.h Sat Jun 5 20:32:52 1999 @@ -46,6 +46,7 @@ extern void inet_proto_init(struct net_proto *pro); extern char *in_ntoa(__u32 in); +extern char *in_ntoa2(__u32 in, char *buf); extern __u32 in_aton(const char *str); #endif diff -ur ../vger3-990605/linux/include/linux/netdevice.h linux/include/linux/netdevice.h --- ../vger3-990605/linux/include/linux/netdevice.h Fri May 28 19:49:50 1999 +++ linux/include/linux/netdevice.h Sun Jun 6 23:18:37 1999 @@ -152,6 +152,7 @@ struct hh_cache *hh_next; /* Next entry */ atomic_t hh_refcnt; /* number of users */ unsigned short hh_type; /* protocol identifier, f.e ETH_P_IP */ + int hh_len; /* length of header */ int 
(*hh_output)(struct sk_buff *skb); rwlock_t hh_lock; /* cached hardware header; allow for machine alignment needs. */ @@ -260,6 +261,7 @@ void *atalk_ptr; /* AppleTalk link */ void *ip_ptr; /* IPv4 specific data */ void *dn_ptr; /* DECnet specific data */ + void *ip6_ptr; /* IPv6 specific data */ struct Qdisc *qdisc; struct Qdisc *qdisc_sleeping; @@ -268,6 +270,13 @@ /* hard_start_xmit synchronizer */ spinlock_t xmit_lock; + /* cpu id of processor entered to hard_start_xmit or -1, + if nobody entered there. + */ + int xmit_lock_owner; + /* device queue lock */ + spinlock_t queue_lock; + atomic_t refcnt; /* Pointers to interface service routines. */ int (*open)(struct device *dev); diff -ur ../vger3-990605/linux/include/linux/pkt_sched.h linux/include/linux/pkt_sched.h --- ../vger3-990605/linux/include/linux/pkt_sched.h Tue Apr 28 17:47:22 1998 +++ linux/include/linux/pkt_sched.h Sun Jun 6 20:29:13 1999 @@ -38,6 +38,9 @@ __u32 pps; /* Current flow packet rate */ __u32 qlen; __u32 backlog; +#ifdef __KERNEL__ + spinlock_t *lock; +#endif }; struct tc_estimator diff -ur ../vger3-990605/linux/include/linux/rtnetlink.h linux/include/linux/rtnetlink.h --- ../vger3-990605/linux/include/linux/rtnetlink.h Mon May 31 17:56:03 1999 +++ linux/include/linux/rtnetlink.h Sat Jun 5 22:33:08 1999 @@ -515,9 +515,6 @@ #ifdef __KERNEL__ -extern atomic_t rtnl_rlockct; -extern wait_queue_head_t rtnl_wait; - extern __inline__ int rtattr_strcmp(struct rtattr *rta, char *str) { int len = strlen(str) + 1; @@ -544,127 +541,31 @@ #define RTA_PUT(skb, attrtype, attrlen, data) \ ({ if (skb_tailroom(skb) < (int)RTA_SPACE(attrlen)) goto rtattr_failure; \ __rta_fill(skb, attrtype, attrlen, data); }) - -extern unsigned long rtnl_wlockct; - -/* NOTE: these locks are not interrupt safe, are not SMP safe, - * they are even not atomic. 8)8)8) ... and it is not a bug. 
- * Really, if these locks will be programmed correctly, - * all the addressing/routing machine would become SMP safe, - * but is absolutely useless at the moment, because all the kernel - * is not reenterable in any case. --ANK - * - * Well, atomic_* and set_bit provide the only thing here: - * gcc is confused not to overoptimize them, that's all. - * I remember as gcc splitted ++ operation, but cannot reproduce - * it with gcc-2.7.*. --ANK - * - * One more note: rwlock facility should be written and put - * to a kernel wide location: f.e. current implementation of semaphores - * (especially, for x86) looks like a wonder. It would be good - * to have something similar for rwlock. Recursive lock could be also - * useful thing. --ANK - */ - -extern __inline__ int rtnl_shlock_nowait(void) -{ - atomic_inc(&rtnl_rlockct); - if (test_bit(0, &rtnl_wlockct)) { - atomic_dec(&rtnl_rlockct); - return -EAGAIN; - } - return 0; -} - -extern __inline__ void rtnl_shlock(void) -{ - while (rtnl_shlock_nowait()) - sleep_on(&rtnl_wait); -} - -/* Check for possibility to PROMOTE shared lock to exclusive. - Shared lock must be already grabbed with rtnl_shlock*(). 
- */ - -extern __inline__ int rtnl_exlock_nowait(void) -{ - if (atomic_read(&rtnl_rlockct) > 1) - return -EAGAIN; - if (test_and_set_bit(0, &rtnl_wlockct)) - return -EAGAIN; - return 0; -} - -extern __inline__ void rtnl_exlock(void) -{ - while (rtnl_exlock_nowait()) - sleep_on(&rtnl_wait); -} - -#if 0 -extern __inline__ void rtnl_shunlock(void) -{ - atomic_dec(&rtnl_rlockct); - if (atomic_read(&rtnl_rlockct) <= 1) { - wake_up(&rtnl_wait); - if (rtnl && rtnl->receive_queue.qlen) - rtnl->data_ready(rtnl, 0); - } -} -#else - -/* The problem: inline requires to include <net/sock.h> and, hence, - almost all of net includes :-( - */ - -#define rtnl_shunlock() ({ \ - atomic_dec(&rtnl_rlockct); \ - if (atomic_read(&rtnl_rlockct) <= 1) { \ - wake_up(&rtnl_wait); \ - if (rtnl && rtnl->receive_queue.qlen) \ - rtnl->data_ready(rtnl, 0); \ - } \ -}) #endif -/* Release exclusive lock. Note, that we do not wake up rtnetlink socket, - * it will be done later after releasing shared lock. - */ - -extern __inline__ void rtnl_exunlock(void) -{ - clear_bit(0, &rtnl_wlockct); - wake_up(&rtnl_wait); -} - -#else +extern struct semaphore rtnl_sem; -extern __inline__ void rtnl_shlock(void) -{ - while (atomic_read(&rtnl_rlockct)) - sleep_on(&rtnl_wait); - atomic_inc(&rtnl_rlockct); -} +#define rtnl_exlock() do { } while(0) +#define rtnl_exunlock() do { } while(0) +#define rtnl_exlock_nowait() (0) -extern __inline__ void rtnl_shunlock(void) -{ - if (atomic_dec_and_test(&rtnl_rlockct)) - wake_up(&rtnl_wait); -} - -extern __inline__ void rtnl_exlock(void) -{ -} - -extern __inline__ void rtnl_exunlock(void) -{ -} +#define rtnl_shlock() down(&rtnl_sem) +#define rtnl_shlock_nowait() down_trylock(&rtnl_sem) +#ifndef CONFIG_RTNETLINK +#define rtnl_shunlock() up(&rtnl_sem) +#else +#define rtnl_shunlock() do { up(&rtnl_sem); \ + if (rtnl && rtnl->receive_queue.qlen) \ + rtnl->data_ready(rtnl, 0); \ + } while(0) #endif extern void rtnl_lock(void); extern void rtnl_unlock(void); extern void rtnetlink_init(void); + + 
#endif /* __KERNEL__ */ diff -ur ../vger3-990605/linux/include/net/addrconf.h linux/include/net/addrconf.h --- ../vger3-990605/linux/include/net/addrconf.h Sun Mar 21 17:28:11 1999 +++ linux/include/net/addrconf.h Sat Jun 5 21:03:22 1999 @@ -56,7 +56,7 @@ extern int ipv6_get_saddr(struct dst_entry *dst, struct in6_addr *daddr, struct in6_addr *saddr); -extern struct inet6_ifaddr * ipv6_get_lladdr(struct device *dev); +extern int ipv6_get_lladdr(struct device *dev, struct in6_addr *); /* * multicast prototypes (mcast.c) @@ -68,6 +68,7 @@ int ifindex, struct in6_addr *addr); extern void ipv6_sock_mc_close(struct sock *sk); +extern int inet6_mc_check(struct sock *sk, struct in6_addr *addr); extern int ipv6_dev_mc_inc(struct device *dev, struct in6_addr *addr); diff -ur ../vger3-990605/linux/include/net/dst.h linux/include/net/dst.h --- ../vger3-990605/linux/include/net/dst.h Fri May 28 19:50:53 1999 +++ linux/include/net/dst.h Sun Jun 6 23:18:48 1999 @@ -170,6 +170,9 @@ if (dst->expires == 0 || (long)(dst->expires - expires) > 0) dst->expires = expires; } + +extern void dst_init(void); + #endif #endif /* _NET_DST_H */ diff -ur ../vger3-990605/linux/include/net/if_inet6.h linux/include/net/if_inet6.h --- ../vger3-990605/linux/include/net/if_inet6.h Sun Mar 8 08:55:16 1998 +++ linux/include/net/if_inet6.h Sat Jun 5 20:32:51 1999 @@ -44,6 +44,7 @@ __u32 valid_lft; __u32 prefered_lft; unsigned long tstamp; + atomic_t refcnt; __u8 probes; __u8 flags; @@ -108,6 +109,7 @@ struct inet6_ifaddr *addr_list; struct ifmcaddr6 *mc_list; + rwlock_t lock; __u32 if_flags; struct neigh_parms *nd_parms; diff -ur ../vger3-990605/linux/include/net/ip.h linux/include/net/ip.h --- ../vger3-990605/linux/include/net/ip.h Mon May 31 17:56:24 1999 +++ linux/include/net/ip.h Sun Jun 6 23:18:54 1999 @@ -146,10 +146,10 @@ skb->protocol = __constant_htons(ETH_P_IP); if (hh) { - read_lock_irq(&hh->hh_lock); + read_lock_bh(&hh->hh_lock); memcpy(skb->data - 16, hh->hh_data, 16); - 
read_unlock_irq(&hh->hh_lock); - skb_push(skb, dev->hard_header_len); + read_unlock_bh(&hh->hh_lock); + skb_push(skb, hh->hh_len); return hh->hh_output(skb); } else if (dst->neighbour) return dst->neighbour->output(skb); diff -ur ../vger3-990605/linux/include/net/neighbour.h linux/include/net/neighbour.h --- ../vger3-990605/linux/include/net/neighbour.h Sun Mar 21 17:28:19 1999 +++ linux/include/net/neighbour.h Sun Jun 6 23:18:43 1999 @@ -96,7 +96,8 @@ __u8 flags; __u8 nud_state; __u8 type; - __u8 probes; + atomic_t probes; + rwlock_t lock; unsigned char ha[MAX_ADDR_LEN]; struct hh_cache *hh; atomic_t refcnt; @@ -155,7 +156,7 @@ struct timer_list proxy_timer; struct sk_buff_head proxy_queue; int entries; - atomic_t lock; + rwlock_t lock; unsigned long last_rand; struct neigh_parms *parms_list; struct neigh_statistics stats; @@ -165,9 +166,12 @@ extern void neigh_table_init(struct neigh_table *tbl); extern int neigh_table_clear(struct neigh_table *tbl); -extern struct neighbour *__neigh_lookup(struct neigh_table *tbl, - const void *pkey, struct device *dev, - int creat); +extern struct neighbour * neigh_lookup(struct neigh_table *tbl, + const void *pkey, + struct device *dev); +extern struct neighbour * neigh_create(struct neigh_table *tbl, + const void *pkey, + struct device *dev); extern void neigh_destroy(struct neighbour *neigh); extern int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); extern int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp); @@ -226,16 +230,6 @@ neigh->confirmed = jiffies; } -extern __inline__ struct neighbour * -neigh_lookup(struct neigh_table *tbl, const void *pkey, struct device *dev) -{ - struct neighbour *neigh; - start_bh_atomic(); - neigh = __neigh_lookup(tbl, pkey, dev, 0); - end_bh_atomic(); - return neigh; -} - extern __inline__ int neigh_is_connected(struct neighbour *neigh) { return neigh->nud_state&NUD_CONNECTED; @@ -254,17 +248,16 @@ return 0; } -extern __inline__ void 
neigh_table_lock(struct neigh_table *tbl) +extern __inline__ struct neighbour * +__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct device *dev, int creat) { - atomic_inc(&tbl->lock); - synchronize_bh(); -} + struct neighbour *n = neigh_lookup(tbl, pkey, dev); -extern __inline__ void neigh_table_unlock(struct neigh_table *tbl) -{ - atomic_dec(&tbl->lock); -} + if (n || !creat) + return n; + return neigh_create(tbl, pkey, dev); +} #endif #endif diff -ur ../vger3-990605/linux/include/net/pkt_cls.h linux/include/net/pkt_cls.h --- ../vger3-990605/linux/include/net/pkt_cls.h Mon Apr 5 19:34:14 1999 +++ linux/include/net/pkt_cls.h Sun Jun 6 23:12:33 1999 @@ -77,14 +77,11 @@ return -1; } -extern __inline__ unsigned long cls_set_class(unsigned long *clp, unsigned long cl) -{ - cl = xchg(clp, cl); - synchronize_bh(); - return cl; -} + extern int register_tcf_proto_ops(struct tcf_proto_ops *ops); extern int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); + + #endif diff -ur ../vger3-990605/linux/include/net/pkt_sched.h linux/include/net/pkt_sched.h --- ../vger3-990605/linux/include/net/pkt_sched.h Sat May 1 16:28:20 1999 +++ linux/include/net/pkt_sched.h Sun Jun 6 23:12:34 1999 @@ -66,9 +66,12 @@ struct Qdisc_head { struct Qdisc_head *forw; + struct Qdisc_head *back; }; extern struct Qdisc_head qdisc_head; +extern spinlock_t qdisc_runqueue_lock; +extern rwlock_t qdisc_tree_lock; struct Qdisc { @@ -106,6 +109,46 @@ int refcnt; }; +extern __inline__ void sch_tree_lock(struct Qdisc *q) +{ + write_lock(&qdisc_tree_lock); + spin_lock_bh(&q->dev->queue_lock); +} + +extern __inline__ void sch_tree_unlock(struct Qdisc *q) +{ + spin_unlock_bh(&q->dev->queue_lock); + write_unlock(&qdisc_tree_lock); +} + +extern __inline__ void tcf_tree_lock(struct tcf_proto *tp) +{ + write_lock(&qdisc_tree_lock); + spin_lock_bh(&tp->q->dev->queue_lock); +} + +extern __inline__ void tcf_tree_unlock(struct tcf_proto *tp) +{ + spin_unlock_bh(&tp->q->dev->queue_lock); + 
write_unlock(&qdisc_tree_lock); +} + + +extern __inline__ unsigned long +cls_set_class(struct tcf_proto *tp, unsigned long *clp, unsigned long cl) +{ + tcf_tree_lock(tp); + cl = xchg(clp, cl); + tcf_tree_unlock(tp); + return cl; +} + +extern __inline__ unsigned long +__cls_set_class(unsigned long *clp, unsigned long cl) +{ + return xchg(clp, cl); +} + /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth @@ -343,12 +386,14 @@ u32 toks; u32 ptoks; psched_time_t t_c; + spinlock_t lock; struct qdisc_rate_table *R_tab; struct qdisc_rate_table *P_tab; struct tc_stats stats; }; +extern int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st); extern void tcf_police_destroy(struct tcf_police *p); extern struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est); extern int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p); @@ -384,20 +429,56 @@ int tc_filter_init(void); int pktsched_init(void); -void qdisc_run_queues(void); -int qdisc_restart(struct device *dev); +extern void qdisc_run_queues(void); +extern int qdisc_restart(struct device *dev); + +extern spinlock_t qdisc_runqueue_lock; + +/* Is it on run list? Reliable only under qdisc_runqueue_lock. */ + +extern __inline__ int qdisc_on_runqueue(struct Qdisc *q) +{ + return q->h.forw != NULL; +} + +/* Is run list not empty? Reliable only under qdisc_runqueue_lock. */ + +extern __inline__ int qdisc_pending(void) +{ + return qdisc_head.forw != &qdisc_head; +} + +/* Add qdisc to tail of run list. Called with BH disabled on this CPU. */ + +extern __inline__ void qdisc_run(struct Qdisc *q) +{ + spin_lock(&qdisc_runqueue_lock); + if (!qdisc_on_runqueue(q)) { + q->h.forw = &qdisc_head; + q->h.back = qdisc_head.back; + qdisc_head.back->forw = &q->h; + qdisc_head.back = &q->h; + } + spin_unlock(&qdisc_runqueue_lock); +} + +/* If the device is not throttled, restart it and add to run list. + * BH must be disabled on this CPU. 
+ */ extern __inline__ void qdisc_wakeup(struct device *dev) { if (!dev->tbusy) { - struct Qdisc *q = dev->qdisc; - if (qdisc_restart(dev) && q->h.forw == NULL) { - q->h.forw = qdisc_head.forw; - qdisc_head.forw = &q->h; - } + spin_lock(&dev->queue_lock); + if (qdisc_restart(dev)) + qdisc_run(dev->qdisc); + spin_unlock(&dev->queue_lock); } } +/* Calculate maximal size of packet seen by hard_start_xmit + routine of this device. + */ extern __inline__ unsigned psched_mtu(struct device *dev) { unsigned mtu = dev->mtu; diff -ur ../vger3-990605/linux/include/net/route.h linux/include/net/route.h --- ../vger3-990605/linux/include/net/route.h Fri May 28 19:50:58 1999 +++ linux/include/net/route.h Sun Jun 6 23:18:54 1999 @@ -35,13 +35,6 @@ #define RT_HASH_DIVISOR 256 -/* - * Prevents LRU trashing, entries considered equivalent, - * if the difference between last use times is less then this number. - */ -#define RT_CACHE_BUBBLE_THRESHOLD (5*HZ) - - #define RTO_ONLINK 0x01 #define RTO_TPROXY 0x80000000 @@ -103,6 +96,7 @@ }; extern struct ip_rt_acct ip_rt_acct[256]; +extern rwlock_t ip_rt_acct_lock; extern void ip_rt_init(void); extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw, diff -ur ../vger3-990605/linux/net/core/dev.c linux/net/core/dev.c --- ../vger3-990605/linux/net/core/dev.c Tue Jun 1 18:05:11 1999 +++ linux/net/core/dev.c Sun Jun 6 18:45:08 1999 @@ -263,13 +263,13 @@ { struct device *dev; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if (strcmp(dev->name, name) == 0) goto out; } out: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return dev; } @@ -277,13 +277,13 @@ { struct device *dev; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if (dev->ifindex == ifindex) goto out; } out: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return dev; } @@ -291,14 +291,14 @@ { struct device *dev; - 
read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if (dev->type == type && memcmp(dev->dev_addr, ha, dev->addr_len) == 0) goto out; } out: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return dev; } @@ -321,7 +321,7 @@ } return -ENFILE; /* Over 100 of the things .. bail out! */ } - + struct device *dev_alloc(const char *name, int *err) { struct device *dev=kmalloc(sizeof(struct device)+16, GFP_KERNEL); @@ -387,9 +387,6 @@ if (dev->flags&IFF_UP) return 0; - /* Setup the lock before we open the faucet. */ - spin_lock_init(&dev->xmit_lock); - /* * Call device private open method */ @@ -452,10 +449,10 @@ if (dev) { dev_do_clear_fastroute(dev); } else { - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) dev_do_clear_fastroute(dev); - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); } } #endif @@ -596,59 +593,61 @@ struct device *dev = skb->dev; struct Qdisc *q; -#ifdef CONFIG_NET_PROFILE - start_bh_atomic(); - NET_PROFILE_ENTER(dev_queue_xmit); -#endif - - spin_lock_bh(&dev->xmit_lock); + /* Grab device queue */ + spin_lock_bh(&dev->queue_lock); q = dev->qdisc; if (q->enqueue) { q->enqueue(skb, q); - qdisc_wakeup(dev); - spin_unlock_bh(&dev->xmit_lock); -#ifdef CONFIG_NET_PROFILE - NET_PROFILE_LEAVE(dev_queue_xmit); - end_bh_atomic(); -#endif + /* If the device is not busy, kick it. + * Otherwise or if queue is not empty after kick, + * add it to run list. + */ + if (dev->tbusy || qdisc_restart(dev)) + qdisc_run(dev->qdisc); + spin_unlock_bh(&dev->queue_lock); return 0; } + spin_unlock_bh(&dev->queue_lock); /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... - Really, it is unlikely that bh protection is necessary here: - virtual devices do not generate EOI events. 
- However, it is possible, that they rely on bh protection + Really, it is unlikely that xmit_lock protection is necessary here. + (f.e. loopback and IP tunnels are clean ignoring statistics counters.) + However, it is possible, that they rely on protection made by us here. + + Check this and shot the lock. It is not prone from deadlocks. + Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags&IFF_UP) { if (netdev_nit) dev_queue_xmit_nit(skb,dev); - if (dev->hard_start_xmit(skb, dev) == 0) { - spin_unlock_bh(&dev->xmit_lock); - -#ifdef CONFIG_NET_PROFILE - NET_PROFILE_LEAVE(dev_queue_xmit); - end_bh_atomic(); -#endif - return 0; + local_bh_disable(); + if (dev->xmit_lock_owner != smp_processor_id()) { + spin_lock(&dev->xmit_lock); + dev->xmit_lock_owner = smp_processor_id(); + if (dev->hard_start_xmit(skb, dev) == 0) { + dev->xmit_lock_owner = -1; + spin_unlock_bh(&dev->xmit_lock); + return 0; + } + dev->xmit_lock_owner = -1; + spin_unlock_bh(&dev->xmit_lock); + if (net_ratelimit()) + printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); + } else { + /* Recursion is detected! 
It is possible, unfortunately */ + local_bh_enable(); + if (net_ratelimit()) + printk(KERN_DEBUG "Dead loop on virtual device %s, fix it urgently!\n", dev->name); } - if (net_ratelimit()) - printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); } - spin_unlock_bh(&dev->xmit_lock); kfree_skb(skb); - -#ifdef CONFIG_NET_PROFILE - NET_PROFILE_LEAVE(dev_queue_xmit); - end_bh_atomic(); -#endif - return 0; } @@ -660,9 +659,6 @@ int netdev_dropping = 0; int netdev_max_backlog = 300; atomic_t netdev_rx_dropped; -#ifdef CONFIG_CPU_IS_SLOW -int net_cpu_congestion; -#endif #ifdef CONFIG_NET_HW_FLOWCONTROL int netdev_throttle_events; @@ -852,14 +848,6 @@ struct packet_type *pt_prev; unsigned short type; unsigned long start_time = jiffies; -#ifdef CONFIG_CPU_IS_SLOW - static unsigned long start_busy = 0; - static unsigned long ave_busy = 0; - - if (start_busy == 0) - start_busy = start_time; - net_cpu_congestion = ave_busy>>8; -#endif NET_PROFILE_ENTER(net_bh); /* @@ -869,9 +857,9 @@ * latency on a transmit interrupt bh. */ - if (qdisc_head.forw != &qdisc_head) + if (qdisc_pending()) qdisc_run_queues(); - + /* * Any data left to process. This may occur because a * mark_bh() is done after we empty the queue including @@ -899,19 +887,6 @@ */ skb = skb_dequeue(&backlog); -#ifdef CONFIG_CPU_IS_SLOW - if (ave_busy > 128*16) { - kfree_skb(skb); - while ((skb = skb_dequeue(&backlog)) != NULL) - kfree_skb(skb); - break; - } -#endif - - -#if 0 - NET_PROFILE_SKB_PASSED(skb, net_bh_skb); -#endif #ifdef CONFIG_NET_FASTROUTE if (skb->pkt_type == PACKET_FASTROUTE) { dev_queue_xmit(skb); @@ -1022,16 +997,9 @@ * One last output flush. 
*/ - if (qdisc_head.forw != &qdisc_head) + if (qdisc_pending()) qdisc_run_queues(); -#ifdef CONFIG_CPU_IS_SLOW - if (1) { - unsigned long start_idle = jiffies; - ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); - start_busy = 0; - } -#endif #ifdef CONFIG_NET_HW_FLOWCONTROL if (netdev_dropping) netdev_wakeup(); @@ -1065,14 +1033,6 @@ */ /* - * This call is useful, but I'd remove it too. - * - * The reason is purely aestetical, it is the only call - * from SIOC* family using struct ifreq in reversed manner. - * Besides that, it is pretty silly to put "drawing" facility - * to kernel, it is useful only to print ifindices - * in readable form, is not it? --ANK - * * We need this ioctl for efficient implementation of the * if_indextoname() function required by the IPv6 API. Without * it, we would have to search all the interfaces to find a @@ -1138,7 +1098,7 @@ */ total = 0; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { for (i=0; inext) { size = sprintf_stats(buffer+len, dev); len+=size; @@ -1245,7 +1205,7 @@ if(pos>offset+length) break; } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); *start=buffer+(offset-begin); /* Start of wanted data */ len-=(offset-begin); /* Start slop */ @@ -1347,7 +1307,7 @@ pos+=size; len+=size; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for(dev = dev_base; dev != NULL; dev = dev->next) { size = sprintf_wireless_stats(buffer+len, dev); len+=size; @@ -1360,7 +1320,7 @@ if(pos > offset + length) break; } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); *start = buffer + (offset - begin); /* Start of wanted data */ len -= (offset - begin); /* Start slop */ @@ -1736,11 +1696,10 @@ if (IW_IS_SET(cmd)) { if (!suser()) return -EPERM; - rtnl_lock(); } + rtnl_lock(); ret = dev_ifsioc(&ifr, cmd); - if (IW_IS_SET(cmd)) - rtnl_unlock(); + rtnl_unlock(); if (!ret && IW_IS_GET(cmd) && copy_to_user(arg, &ifr, sizeof(struct 
ifreq))) return -EFAULT; @@ -1769,6 +1728,10 @@ { struct device *d, **dp; + spin_lock_init(&dev->queue_lock); + spin_lock_init(&dev->xmit_lock); + dev->xmit_lock_owner = -1; + if (dev_boot_phase) { /* This is NOT bug, but I am not sure, that all the devices, initialized before netdev module is started @@ -1784,14 +1747,13 @@ printk(KERN_INFO "early initialization of device %s is deferred\n", dev->name); /* Check for existence, and append to tail of chain */ - write_lock_bh(&dev_base_lock); for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { if (d == dev || strcmp(d->name, dev->name) == 0) { - write_unlock_bh(&dev_base_lock); return -EEXIST; } } dev->next = NULL; + write_lock_bh(&dev_base_lock); *dp = dev; write_unlock_bh(&dev_base_lock); return 0; @@ -1803,24 +1765,22 @@ if (dev->init && dev->init(dev) != 0) return -EIO; + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + /* Check for existence, and append to tail of chain */ - write_lock_bh(&dev_base_lock); for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { if (d == dev || strcmp(d->name, dev->name) == 0) { - write_unlock_bh(&dev_base_lock); return -EEXIST; } } dev->next = NULL; dev_init_scheduler(dev); + write_lock_bh(&dev_base_lock); *dp = dev; write_unlock_bh(&dev_base_lock); - dev->ifindex = -1; - dev->ifindex = dev_new_index(); - if (dev->iflink == -1) - dev->iflink = dev->ifindex; - /* Notify protocols, that a new device appeared. */ notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); @@ -1831,15 +1791,35 @@ { struct device *d, **dp; - if (dev_boot_phase == 0) { - /* If device is running, close it. - It is very bad idea, really we should - complain loudly here, but random hackery - in linux/drivers/net likes it. - */ - if (dev->flags & IFF_UP) - dev_close(dev); + /* If device is running, close it first. */ + if (dev->flags & IFF_UP) + dev_close(dev); + /* And unlink it from device chain. 
*/ + for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev) { + write_lock_bh(&dev_base_lock); + *dp = d->next; + write_unlock_bh(&dev_base_lock); + + /* Sorry. It is known "feature". The race is clear. + Keep it after device reference counting will + be complete. + */ + synchronize_bh(); + break; + } + } + if (d == NULL) + return -ENODEV; + + /* It is "synchronize_bh" to those of guys, who overslept + in skb_alloc/page fault etc. that device is off-line. + Again, it can be removed only if devices are refcounted. + */ + dev_lock_wait(); + + if (dev_boot_phase == 0) { #ifdef CONFIG_NET_FASTROUTE dev_clear_fastroute(dev); #endif @@ -1856,27 +1836,11 @@ * Flush the multicast chain */ dev_mc_discard(dev); - - /* To avoid pointers looking to nowhere, - we wait for end of critical section */ - dev_lock_wait(); } - /* And unlink it from device chain. */ - write_lock_bh(&dev_base_lock); - for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { - if (d == dev) { - *dp = d->next; - d->next = NULL; - write_unlock_bh(&dev_base_lock); - - if (dev->destructor) - dev->destructor(dev); - return 0; - } - } - write_unlock_bh(&dev_base_lock); - return -ENODEV; + if (dev->destructor) + dev->destructor(dev); + return 0; } @@ -2018,16 +1982,24 @@ * If the call to dev->init fails, the dev is removed * from the chain disconnecting the device until the * next reboot. + * + * NB At boot phase networking is dead. No locking is required. + * But we still preserve dev_base_lock for sanity. */ dp = &dev_base; while ((dev = *dp) != NULL) { + spin_lock_init(&dev->queue_lock); + spin_lock_init(&dev->xmit_lock); + dev->xmit_lock_owner = -1; dev->iflink = -1; if (dev->init && dev->init(dev)) { /* * It failed to come up. Unhook it. 
*/ + write_lock_bh(&dev_base_lock); *dp = dev->next; + write_unlock_bh(&dev_base_lock); } else { dp = &dev->next; dev->ifindex = dev_new_index(); @@ -2055,6 +2027,7 @@ dev_boot_phase = 0; + dst_init(); dev_mcast_init(); #ifdef CONFIG_IP_PNP diff -ur ../vger3-990605/linux/net/core/dev_mcast.c linux/net/core/dev_mcast.c --- ../vger3-990605/linux/net/core/dev_mcast.c Fri May 28 19:52:51 1999 +++ linux/net/core/dev_mcast.c Sat Jun 5 19:52:13 1999 @@ -58,7 +58,11 @@ * * Device mc lists are changed by bh at least if IPv6 is enabled, * so that it must be bh protected. + * + * We protect all mc lists with global rw lock + * and block accesses to device mc filters with dev->xmit_lock. */ +static rwlock_t dev_mc_lock = RW_LOCK_UNLOCKED; /* * Update the multicast list into the physical NIC controller. @@ -69,7 +73,7 @@ /* Don't do anything till we up the interface [dev_open will call this function so the list will stay sane] */ - + if(!(dev->flags&IFF_UP)) return; @@ -80,11 +84,15 @@ if(dev->set_multicast_list==NULL) return; - start_bh_atomic(); + read_lock_bh(&dev_mc_lock); + spin_lock(&dev->xmit_lock); + dev->xmit_lock_owner = smp_processor_id(); dev->set_multicast_list(dev); - end_bh_atomic(); + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + read_unlock_bh(&dev_mc_lock); } - + /* * Delete a device level multicast */ @@ -94,7 +102,7 @@ int err = 0; struct dev_mc_list *dmi, **dmip; - start_bh_atomic(); + write_lock_bh(&dev_mc_lock); for (dmip=&dev->mc_list; (dmi=*dmip)!=NULL; dmip=&dmi->next) { /* * Find the entry we want to delete. The device could @@ -120,14 +128,15 @@ * We have altered the list, so the card * loaded filter is now wrong. Fix it */ - end_bh_atomic(); + write_unlock_bh(&dev_mc_lock); + dev_mc_upload(dev); return 0; } } err = -ENOENT; done: - end_bh_atomic(); + write_unlock_bh(&dev_mc_lock); return err; } @@ -140,9 +149,12 @@ int err = 0; struct dev_mc_list *dmi, *dmi1; + /* RED-PEN: does gfp_any() work now? 
It requires + true local_bh_disable rather than global. + */ dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), gfp_any()); - start_bh_atomic(); + write_lock_bh(&dev_mc_lock); for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) { if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) { if (glbl) { @@ -156,8 +168,10 @@ } } - if ((dmi=dmi1)==NULL) + if ((dmi=dmi1)==NULL) { + write_unlock_bh(&dev_mc_lock); return -ENOMEM; + } memcpy(dmi->dmi_addr, addr, alen); dmi->dmi_addrlen=alen; dmi->next=dev->mc_list; @@ -165,12 +179,12 @@ dmi->dmi_gusers=glbl ? 1 : 0; dev->mc_list=dmi; dev->mc_count++; - end_bh_atomic(); + write_unlock_bh(&dev_mc_lock); dev_mc_upload(dev); return 0; done: - end_bh_atomic(); + write_unlock_bh(&dev_mc_lock); if (dmi1) kfree(dmi1); return err; @@ -182,7 +196,7 @@ void dev_mc_discard(struct device *dev) { - start_bh_atomic(); + write_lock_bh(&dev_mc_lock); while (dev->mc_list!=NULL) { struct dev_mc_list *tmp=dev->mc_list; dev->mc_list=tmp->next; @@ -191,7 +205,7 @@ kfree_s(tmp,sizeof(*tmp)); } dev->mc_count=0; - end_bh_atomic(); + write_unlock_bh(&dev_mc_lock); } #ifdef CONFIG_PROC_FS @@ -203,8 +217,9 @@ int len=0; struct device *dev; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { + read_lock_bh(&dev_mc_lock); for (m = dev->mc_list; m; m = m->next) { int i; @@ -221,14 +236,17 @@ len=0; begin=pos; } - if (pos > offset+length) + if (pos > offset+length) { + read_unlock_bh(&dev_mc_lock); goto done; + } } + read_unlock_bh(&dev_mc_lock); } *eof = 1; done: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); *start=buffer+(offset-begin); len-=(offset-begin); if(len>length) diff -ur ../vger3-990605/linux/net/core/dst.c linux/net/core/dst.c --- ../vger3-990605/linux/net/core/dst.c Fri May 28 19:52:52 1999 +++ linux/net/core/dst.c Sun Jun 6 18:45:08 1999 @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -39,16 +40,16 @@ static struct timer_list 
dst_gc_timer = { NULL, NULL, DST_GC_MIN, 0L, dst_run_gc }; -#if RT_CACHE_DEBUG >= 2 -atomic_t hh_count; -#endif static void dst_run_gc(unsigned long dummy) { int delayed = 0; struct dst_entry * dst, **dstp; - spin_lock(&dst_lock); + if (!spin_trylock(&dst_lock)) { + mod_timer(&dst_gc_timer, jiffies + HZ/10); + return; + } del_timer(&dst_gc_timer); dstp = &dst_garbage_list; @@ -159,4 +160,37 @@ dst->ops->destroy(dst); atomic_dec(&dst_total); kfree(dst); +} + +static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + struct dst_entry *dst; + + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: + spin_lock_bh(&dst_lock); + for (dst = dst_garbage_list; dst; dst = dst->next) { + if (dst->dev == dev) { + dst->input = dst_discard; + dst->output = dst_blackhole; + dst->dev = &loopback_dev; + } + } + spin_unlock_bh(&dst_lock); + break; + } + return NOTIFY_DONE; +} + +struct notifier_block dst_dev_notifier = { + dst_dev_event, + NULL, + 0 +}; + +__initfunc(void dst_init(void)) +{ + register_netdevice_notifier(&dst_dev_notifier); } diff -ur ../vger3-990605/linux/net/core/neighbour.c linux/net/core/neighbour.c --- ../vger3-990605/linux/net/core/neighbour.c Thu Jun 3 18:34:52 1999 +++ linux/net/core/neighbour.c Sun Jun 6 19:48:16 1999 @@ -12,12 +12,10 @@ * * Fixes: * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. - * Horst von Brand Add #include */ #include #include -#include #include #include #include @@ -30,33 +28,6 @@ #include #include -/* - NOTE. The most unpleasent question is serialization of - accesses to resolved addresses. The problem is that addresses - are modified by bh, but they are referenced from normal - kernel thread. Before today no locking was made. - My reasoning was that corrupted address token will be copied - to packet with cosmologically small probability - (it is even difficult to estimate such small number) - and it is very silly to waste cycles in fast path to lock them. 
- - But now I changed my mind, but not because previous statement - is wrong. Actually, neigh->ha MAY BE not opaque byte array, - but reference to some private data. In this case even neglibible - corruption probability becomes bug. - - - hh cache is protected by rwlock. It assumes that - hh cache update procedure is short and fast, and that - read_lock is cheaper than start_bh_atomic(). - - ha tokens, saved in neighbour entries, are protected - by bh_atomic(). - - no protection is made in /proc reading. It is OK, because - /proc is broken by design in any case, and - corrupted output is normal behaviour there. - - --ANK (981025) - */ - #define NEIGH_DEBUG 1 #define NEIGH_PRINTK(x...) printk(x) @@ -83,6 +54,46 @@ static int neigh_glbl_allocs; static struct neigh_table *neigh_tables; +#if defined(__i386__) && defined(__SMP__) +#define ASSERT_WL(n) if ((int)((n)->lock.lock) >= 0) { printk("WL assertion failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } +#else +#define ASSERT_WL(n) do { } while(0) +#endif + +/* + Neighbour hash table buckets are protected with rwlock tbl->lock. + + - All the scans/updates to hash buckets MUST be made under this lock. + - NOTHING clever should be made under this lock: no callbacks + to protocol backends, no attempts to send something to network. + It will result in deadlocks, if backend/driver wants to use neighbour + cache. + - If the entry requires some non-trivial actions, increase + its reference count and release table lock. + + Neighbour entries are protected: + - with reference count. + - with rwlock neigh->lock + + Reference count prevents destruction. + + neigh->lock mainly serializes ll address data and its validity state. + However, the same lock is used to protect another entry fields: + - timer + - resolution queue + + Again, nothing clever shall be made under neigh->lock, + the most complicated procedure, which we allow is dev->hard_header. 
+ It is supposed, that dev->hard_header is simplistic and does + not make callbacks to neighbour tables. + + The last lock is neigh_tbl_lock. It is pure SMP lock, protecting + list of neighbour tables. This list is used only in process context, + so that this lock is useless with big kernel lock. + */ + +static rwlock_t neigh_tbl_lock = RW_LOCK_UNLOCKED; + static int neigh_blackhole(struct sk_buff *skb) { kfree_skb(skb); @@ -106,13 +117,11 @@ int shrunk = 0; int i; - if (atomic_read(&tbl->lock)) - return 0; - for (i=0; i<=NEIGH_HASHMASK; i++) { struct neighbour *n, **np; np = &tbl->hash_buckets[i]; + write_lock_bh(&tbl->lock); while ((n = *np) != NULL) { /* Neighbour record may be discarded if: - nobody refers to it. @@ -124,6 +133,7 @@ It is not clear, what is better table overflow or flooding. */ + write_lock(&n->lock); if (atomic_read(&n->refcnt) == 0 && !(n->nud_state&NUD_PERMANENT) && (n->nud_state != NUD_INCOMPLETE || @@ -132,11 +142,14 @@ n->tbl = NULL; tbl->entries--; shrunk = 1; + write_unlock(&n->lock); neigh_destroy(n); continue; } + write_unlock(&n->lock); np = &n->next; } + write_unlock_bh(&tbl->lock); } tbl->last_flush = jiffies; @@ -147,12 +160,8 @@ { int i; - if (atomic_read(&tbl->lock)) { - NEIGH_PRINTK1("neigh_ifdown: impossible event 1763\n"); - return -EBUSY; - } + write_lock_bh(&tbl->lock); - start_bh_atomic(); for (i=0; i<=NEIGH_HASHMASK; i++) { struct neighbour *n, **np; @@ -163,6 +172,7 @@ continue; } *np = n->next; + write_lock(&n->lock); n->tbl = NULL; tbl->entries--; if (atomic_read(&n->refcnt)) { @@ -185,33 +195,32 @@ else n->nud_state = NUD_NONE; NEIGH_PRINTK2("neigh %p is stray.\n", n); - } else + write_unlock(&n->lock); + } else { + write_unlock(&n->lock); neigh_destroy(n); + } } } del_timer(&tbl->proxy_timer); skb_queue_purge(&tbl->proxy_queue); pneigh_ifdown(tbl, dev); - end_bh_atomic(); + write_unlock_bh(&tbl->lock); return 0; } -static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) +static struct neighbour 
*neigh_alloc(struct neigh_table *tbl) { struct neighbour *n; unsigned long now = jiffies; - if (tbl->entries > tbl->gc_thresh1) { - if (creat < 0) + if (tbl->entries > tbl->gc_thresh3 || + (tbl->entries > tbl->gc_thresh2 && + now - tbl->last_flush > 5*HZ)) { + if (neigh_forced_gc(tbl) == 0 && + tbl->entries > tbl->gc_thresh3) return NULL; - if (tbl->entries > tbl->gc_thresh3 || - (tbl->entries > tbl->gc_thresh2 && - now - tbl->last_flush > 5*HZ)) { - if (neigh_forced_gc(tbl) == 0 && - tbl->entries > tbl->gc_thresh3) - return NULL; - } } n = kmalloc(tbl->entry_size, GFP_ATOMIC); @@ -221,6 +230,7 @@ memset(n, 0, tbl->entry_size); skb_queue_head_init(&n->arp_queue); + n->lock = RW_LOCK_UNLOCKED; n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; @@ -233,9 +243,8 @@ return n; } - -struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, - struct device *dev, int creat) +struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev) { struct neighbour *n; u32 hash_val; @@ -247,17 +256,26 @@ hash_val ^= hash_val>>3; hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK; + read_lock_bh(&tbl->lock); for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { if (dev == n->dev && memcmp(n->primary_key, pkey, key_len) == 0) { atomic_inc(&n->refcnt); - return n; + break; } } - if (!creat) - return NULL; + read_unlock_bh(&tbl->lock); + return n; +} - n = neigh_alloc(tbl, creat); +struct neighbour * neigh_create(struct neigh_table *tbl, const void *pkey, + struct device *dev) +{ + struct neighbour *n, *n1; + u32 hash_val; + int key_len = tbl->key_len; + + n = neigh_alloc(tbl); if (n == NULL) return NULL; @@ -277,11 +295,30 @@ } n->confirmed = jiffies - (n->parms->base_reachable_time<<1); - atomic_set(&n->refcnt, 1); + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>3; + hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK; + + 
write_lock_bh(&tbl->lock); + for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { + if (dev == n1->dev && + memcmp(n1->primary_key, pkey, key_len) == 0) { + atomic_inc(&n1->refcnt); + write_unlock_bh(&tbl->lock); + neigh_destroy(n); + return n1; + } + } + tbl->entries++; + n->tbl = tbl; + atomic_set(&n->refcnt, 1); n->next = tbl->hash_buckets[hash_val]; tbl->hash_buckets[hash_val] = n; - n->tbl = tbl; + write_unlock_bh(&tbl->lock); NEIGH_PRINTK2("neigh %p is created.\n", n); return n; } @@ -393,7 +430,9 @@ while ((hh = neigh->hh) != NULL) { neigh->hh = hh->hh_next; hh->hh_next = NULL; + write_lock_bh(&hh->hh_lock); hh->hh_output = neigh_blackhole; + write_unlock_bh(&hh->hh_lock); if (atomic_dec_and_test(&hh->hh_refcnt)) kfree(hh); } @@ -411,6 +450,8 @@ /* Neighbour state is suspicious; disable fast path. + + Called with write_locked neigh. */ static void neigh_suspect(struct neighbour *neigh) { @@ -418,6 +459,8 @@ NEIGH_PRINTK2("neigh %p is suspecteded.\n", neigh); + ASSERT_WL(neigh); + neigh->output = neigh->ops->output; for (hh = neigh->hh; hh; hh = hh->hh_next) @@ -426,6 +469,8 @@ /* Neighbour state is OK; enable fast path. + + Called with write_locked neigh. */ static void neigh_connect(struct neighbour *neigh) { @@ -433,6 +478,8 @@ NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + ASSERT_WL(neigh); + neigh->output = neigh->ops->connected_output; for (hh = neigh->hh; hh; hh = hh->hh_next) @@ -448,6 +495,8 @@ If a routine wants to know TRUE entry state, it calls neigh_sync before checking state. + + Called with write_locked neigh. 
*/ static void neigh_sync(struct neighbour *n) @@ -455,6 +504,7 @@ unsigned long now = jiffies; u8 state = n->nud_state; + ASSERT_WL(n); if (state&(NUD_NOARP|NUD_PERMANENT)) return; if (state&NUD_REACHABLE) { @@ -478,11 +528,8 @@ unsigned long now = jiffies; int i; - if (atomic_read(&tbl->lock)) { - tbl->gc_timer.expires = now + 1*HZ; - add_timer(&tbl->gc_timer); - return; - } + + write_lock(&tbl->lock); /* * periodicly recompute ReachableTime from random function @@ -500,10 +547,15 @@ np = &tbl->hash_buckets[i]; while ((n = *np) != NULL) { - unsigned state = n->nud_state; + unsigned state; + + write_lock(&n->lock); - if (state&(NUD_PERMANENT|NUD_IN_TIMER)) + state = n->nud_state; + if (state&(NUD_PERMANENT|NUD_IN_TIMER)) { + write_unlock(&n->lock); goto next_elt; + } if ((long)(n->used - n->confirmed) < 0) n->used = n->confirmed; @@ -514,6 +566,7 @@ n->tbl = NULL; n->next = NULL; tbl->entries--; + write_unlock(&n->lock); neigh_destroy(n); continue; } @@ -523,6 +576,7 @@ n->nud_state = NUD_STALE; neigh_suspect(n); } + write_unlock(&n->lock); next_elt: np = &n->next; @@ -531,6 +585,7 @@ tbl->gc_timer.expires = now + tbl->gc_interval; add_timer(&tbl->gc_timer); + write_unlock(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) @@ -546,11 +601,17 @@ { unsigned long now = jiffies; struct neighbour *neigh = (struct neighbour*)arg; - unsigned state = neigh->nud_state; + unsigned state; + int notify = 0; + + write_lock(&neigh->lock); + atomic_inc(&neigh->refcnt); + + state = neigh->nud_state; if (!(state&NUD_IN_TIMER)) { NEIGH_PRINTK1("neigh: timer & !nud_in_timer\n"); - return; + goto out; } if ((state&NUD_VALID) && @@ -558,18 +619,19 @@ neigh->nud_state = NUD_REACHABLE; NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); neigh_connect(neigh); - return; + goto out; } if (state == NUD_DELAY) { NEIGH_PRINTK2("neigh %p is probed.\n", neigh); neigh->nud_state = NUD_PROBE; - neigh->probes = 0; + atomic_set(&neigh->probes, 0); } - if (neigh->probes >= 
neigh_max_probes(neigh)) { + if (atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { struct sk_buff *skb; neigh->nud_state = NUD_FAILED; + notify = 1; neigh->tbl->stats.res_failed++; NEIGH_PRINTK2("neigh %p is failed.\n", neigh); @@ -578,44 +640,60 @@ So that, we try to be accurate and avoid dead loop. --ANK */ - while(neigh->nud_state==NUD_FAILED && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) + while(neigh->nud_state==NUD_FAILED && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) { + write_unlock(&neigh->lock); neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); + } skb_queue_purge(&neigh->arp_queue); - return; + goto out; } neigh->timer.expires = now + neigh->parms->retrans_time; add_timer(&neigh->timer); + write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue)); - neigh->probes++; + atomic_inc(&neigh->probes); + neigh_release(neigh); + return; + +out: + write_unlock(&neigh->lock); +#ifdef CONFIG_ARPD + if (notify && neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { - start_bh_atomic(); + write_lock_bh(&neigh->lock); if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) { if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) { if (neigh->tbl == NULL) { NEIGH_PRINTK2("neigh %p used after death.\n", neigh); if (skb) kfree_skb(skb); - end_bh_atomic(); + write_unlock_bh(&neigh->lock); return 1; } if (neigh->parms->mcast_probes + neigh->parms->app_probes) { - neigh->probes = neigh->parms->ucast_probes; + atomic_set(&neigh->probes, neigh->parms->ucast_probes); neigh->nud_state = NUD_INCOMPLETE; neigh->timer.expires = jiffies + neigh->parms->retrans_time; add_timer(&neigh->timer); - + write_unlock_bh(&neigh->lock); neigh->ops->solicit(neigh, skb); - neigh->probes++; + atomic_inc(&neigh->probes); + write_lock_bh(&neigh->lock); } else { neigh->nud_state = NUD_FAILED; + write_unlock_bh(&neigh->lock); + if (skb) 
kfree_skb(skb); - end_bh_atomic(); return 1; } } @@ -629,7 +707,7 @@ } __skb_queue_head(&neigh->arp_queue, skb); } - end_bh_atomic(); + write_unlock_bh(&neigh->lock); return 1; } if (neigh->nud_state == NUD_STALE) { @@ -639,7 +717,7 @@ add_timer(&neigh->timer); } } - end_bh_atomic(); + write_unlock_bh(&neigh->lock); return 0; } @@ -651,9 +729,9 @@ if (update) { for (hh=neigh->hh; hh; hh=hh->hh_next) { - write_lock_irq(&hh->hh_lock); + write_lock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); - write_unlock_irq(&hh->hh_lock); + write_unlock_bh(&hh->hh_lock); } } } @@ -665,15 +743,23 @@ -- new is new state. -- override==1 allows to override existing lladdr, if it is different. -- arp==0 means that the change is administrative. + + Caller MUST hold reference count on the entry. */ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp) { - u8 old = neigh->nud_state; + u8 old; + int err; + int notify = 0; struct device *dev = neigh->dev; + write_lock_bh(&neigh->lock); + old = neigh->nud_state; + + err = -EPERM; if (arp && (old&(NUD_NOARP|NUD_PERMANENT))) - return -EPERM; + goto out; if (!(new&NUD_VALID)) { if (old&NUD_IN_TIMER) @@ -681,7 +767,9 @@ if (old&NUD_CONNECTED) neigh_suspect(neigh); neigh->nud_state = new; - return 0; + err = 0; + notify = old&NUD_VALID; + goto out; } /* Compare new lladdr with cached one */ @@ -698,14 +786,15 @@ if (memcmp(lladdr, neigh->ha, dev->addr_len) == 0) lladdr = neigh->ha; else if (!override) - return -EPERM; + goto out; } } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ + err = -EINVAL; if (!(old&NUD_VALID)) - return -EINVAL; + goto out; lladdr = neigh->ha; } @@ -718,10 +807,11 @@ /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. 
*/ + err = 0; if (old&NUD_VALID) { if (lladdr == neigh->ha) if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED))) - return 0; + goto out; } if (old&NUD_IN_TIMER) del_timer(&neigh->timer); @@ -731,12 +821,11 @@ neigh_update_hhs(neigh); neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1); #ifdef CONFIG_ARPD - if (neigh->parms->app_probes) - neigh_app_notify(neigh); + notify = 1; #endif } if (new == old) - return 0; + goto out; if (new&NUD_CONNECTED) neigh_connect(neigh); else @@ -749,14 +838,22 @@ while (neigh->nud_state&NUD_VALID && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) { struct neighbour *n1 = neigh; + write_unlock_bh(&neigh->lock); /* On shaper/eql skb->dst->neighbour != neigh :( */ if (skb->dst && skb->dst->neighbour) n1 = skb->dst->neighbour; n1->output(skb); + write_lock_bh(&neigh->lock); } skb_queue_purge(&neigh->arp_queue); } - return 0; +out: + write_unlock_bh(&neigh->lock); +#ifdef CONFIG_ARPD + if (notify && neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + return err; } struct neighbour * neigh_event_ns(struct neigh_table *tbl, @@ -839,15 +936,15 @@ int err; struct device *dev = neigh->dev; if (dev->hard_header_cache && dst->hh == NULL) { - start_bh_atomic(); + write_lock_bh(&neigh->lock); if (dst->hh == NULL) neigh_hh_init(neigh, dst, dst->ops->protocol); err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); - end_bh_atomic(); + write_unlock_bh(&neigh->lock); } else { - start_bh_atomic(); + read_lock_bh(&neigh->lock); err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); - end_bh_atomic(); + read_unlock_bh(&neigh->lock); } if (err >= 0) return neigh->ops->queue_xmit(skb); @@ -873,9 +970,9 @@ __skb_pull(skb, skb->nh.raw - skb->data); - start_bh_atomic(); + read_lock_bh(&neigh->lock); err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); - end_bh_atomic(); + read_unlock_bh(&neigh->lock); if (err >= 0) return 
neigh->ops->queue_xmit(skb); kfree_skb(skb); @@ -949,8 +1046,10 @@ return NULL; } } + write_lock_bh(&tbl->lock); p->next = tbl->parms.next; tbl->parms.next = p; + write_unlock_bh(&tbl->lock); } return p; } @@ -961,10 +1060,11 @@ if (parms == NULL || parms == &tbl->parms) return; + write_lock_bh(&tbl->lock); for (p = &tbl->parms.next; *p; p = &(*p)->next) { if (*p == parms) { *p = parms->next; - synchronize_bh(); + write_unlock_bh(&tbl->lock); #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(parms); #endif @@ -972,6 +1072,7 @@ return; } } + write_unlock_bh(&tbl->lock); NEIGH_PRINTK1("neigh_release_parms: not found\n"); } @@ -983,6 +1084,7 @@ tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); init_timer(&tbl->gc_timer); + tbl->lock = RW_LOCK_UNLOCKED; tbl->gc_timer.data = (unsigned long)tbl; tbl->gc_timer.function = neigh_periodic_timer; tbl->gc_timer.expires = now + tbl->gc_interval + tbl->parms.reachable_time; @@ -995,29 +1097,30 @@ tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time*20; + write_lock(&neigh_tbl_lock); tbl->next = neigh_tables; neigh_tables = tbl; + write_unlock(&neigh_tbl_lock); } int neigh_table_clear(struct neigh_table *tbl) { struct neigh_table **tp; - start_bh_atomic(); del_timer(&tbl->gc_timer); del_timer(&tbl->proxy_timer); skb_queue_purge(&tbl->proxy_queue); neigh_ifdown(tbl, NULL); - end_bh_atomic(); if (tbl->entries) printk(KERN_CRIT "neighbour leakage\n"); + write_lock(&neigh_tbl_lock); for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { if (*tp == tbl) { *tp = tbl->next; - synchronize_bh(); break; } } + write_unlock(&neigh_tbl_lock); #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&tbl->parms); #endif @@ -1039,12 +1142,14 @@ return -ENODEV; } + read_lock(&neigh_tbl_lock); for (tbl=neigh_tables; tbl; tbl = tbl->next) { int err = 0; struct neighbour *n; if (tbl->family != ndm->ndm_family) continue; + read_unlock(&neigh_tbl_lock); if (nda[NDA_DST-1] == NULL || nda[NDA_DST-1]->rta_len != 
RTA_LENGTH(tbl->key_len)) @@ -1056,15 +1161,14 @@ if (dev == NULL) return -EINVAL; - start_bh_atomic(); - n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0); + n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev); if (n) { err = neigh_update(n, NULL, NUD_FAILED, 1, 0); neigh_release(n); } - end_bh_atomic(); return err; } + read_unlock(&neigh_tbl_lock); return -EADDRNOTAVAIL; } @@ -1081,12 +1185,15 @@ return -ENODEV; } + read_lock(&neigh_tbl_lock); for (tbl=neigh_tables; tbl; tbl = tbl->next) { int err = 0; struct neighbour *n; if (tbl->family != ndm->ndm_family) continue; + read_unlock(&neigh_tbl_lock); + if (nda[NDA_DST-1] == NULL || nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) return -EINVAL; @@ -1100,8 +1207,7 @@ if (nda[NDA_LLADDR-1] != NULL && nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len)) return -EINVAL; - start_bh_atomic(); - n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0); + n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev); if (n) { if (nlh->nlmsg_flags&NLM_F_EXCL) err = -EEXIST; @@ -1119,9 +1225,9 @@ } if (n) neigh_release(n); - end_bh_atomic(); return err; } + read_unlock(&neigh_tbl_lock); return -EADDRNOTAVAIL; } @@ -1141,15 +1247,17 @@ ndm->ndm_family = n->ops->family; ndm->ndm_flags = n->flags; ndm->ndm_type = n->type; - ndm->ndm_state = n->nud_state; ndm->ndm_ifindex = n->dev->ifindex; RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); + read_lock_bh(&n->lock); + ndm->ndm_state = n->nud_state; if (n->nud_state&NUD_VALID) RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); ci.ndm_used = now - n->used; ci.ndm_confirmed = now - n->confirmed; ci.ndm_updated = now - n->updated; ci.ndm_refcnt = atomic_read(&n->refcnt); + read_unlock_bh(&n->lock); RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); nlh->nlmsg_len = skb->tail - b; return skb->len; @@ -1173,20 +1281,20 @@ if (h < s_h) continue; if (h > s_h) s_idx = 0; - start_bh_atomic(); + read_lock_bh(&tbl->lock); for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, 
idx++) { if (idx < s_idx) continue; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH) <= 0) { - end_bh_atomic(); + read_unlock_bh(&tbl->lock); cb->args[1] = h; cb->args[2] = idx; return -1; } } - end_bh_atomic(); + read_unlock_bh(&tbl->lock); } cb->args[1] = h; @@ -1203,6 +1311,7 @@ s_t = cb->args[0]; + read_lock(&neigh_tbl_lock); for (tbl=neigh_tables, t=0; tbl; tbl = tbl->next, t++) { if (t < s_t) continue; if (family && tbl->family != family) @@ -1212,6 +1321,7 @@ if (neigh_dump_table(tbl, skb, cb) < 0) break; } + read_unlock(&neigh_tbl_lock); cb->args[0] = t; diff -ur ../vger3-990605/linux/net/core/rtnetlink.c linux/net/core/rtnetlink.c --- ../vger3-990605/linux/net/core/rtnetlink.c Fri May 28 19:52:55 1999 +++ linux/net/core/rtnetlink.c Sun Jun 6 18:45:05 1999 @@ -50,22 +50,22 @@ #include #include -atomic_t rtnl_rlockct; -DECLARE_WAIT_QUEUE_HEAD(rtnl_wait); +DECLARE_MUTEX(rtnl_sem); - -void rtnl_lock() +void rtnl_lock(void) { rtnl_shlock(); rtnl_exlock(); } - -void rtnl_unlock() + +void rtnl_unlock(void) { rtnl_exunlock(); rtnl_shunlock(); } + + int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) { memset(tb, 0, sizeof(struct rtattr*)*maxattr); @@ -82,8 +82,6 @@ #ifdef CONFIG_RTNETLINK struct sock *rtnl; -unsigned long rtnl_wlockct; - struct rtnetlink_link * rtnetlink_links[NPROTO]; #define _S 1 /* superuser privileges required */ @@ -189,14 +187,14 @@ int s_idx = cb->args[0]; struct device *dev; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx) continue; if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq) <= 0) break; } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); cb->args[0] = idx; return skb->len; @@ -218,9 +216,7 @@ continue; if (idx > s_idx) memset(&cb->args[0], 0, sizeof(cb->args)); - if (rtnetlink_links[idx][type].dumpit(skb, cb) == 0) - 
continue; - if (skb_tailroom(skb) < 256) + if (rtnetlink_links[idx][type].dumpit(skb, cb)) break; } cb->family = idx; @@ -247,8 +243,6 @@ static int rtnetlink_done(struct netlink_callback *cb) { - if (cap_raised(NETLINK_CB(cb->skb).eff_cap, CAP_NET_ADMIN) && cb->nlh->nlmsg_flags&NLM_F_ATOMIC) - rtnl_shunlock(); return 0; } @@ -316,15 +310,9 @@ if (link->dumpit == NULL) goto err_inval; - /* Super-user locks all the tables to get atomic snapshot */ - if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) - && nlh->nlmsg_flags&NLM_F_ATOMIC) - atomic_inc(&rtnl_rlockct); if ((*errp = netlink_dump_start(rtnl, skb, nlh, link->dumpit, rtnetlink_done)) != 0) { - if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) && nlh->nlmsg_flags&NLM_F_ATOMIC) - atomic_dec(&rtnl_rlockct); return -1; } rlen = NLMSG_ALIGN(nlh->nlmsg_len); diff -ur ../vger3-990605/linux/net/ethernet/eth.c linux/net/ethernet/eth.c --- ../vger3-990605/linux/net/ethernet/eth.c Tue Jun 1 18:05:17 1999 +++ linux/net/ethernet/eth.c Sat Jun 5 19:52:13 1999 @@ -248,6 +248,7 @@ eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, dev->addr_len); memcpy(eth->h_dest, neigh->ha, dev->addr_len); + hh->hh_len = ETH_HLEN; return 0; } diff -ur ../vger3-990605/linux/net/ipv4/arp.c linux/net/ipv4/arp.c --- ../vger3-990605/linux/net/ipv4/arp.c Sun Mar 21 17:29:19 1999 +++ linux/net/ipv4/arp.c Sun Jun 6 18:45:05 1999 @@ -119,6 +119,11 @@ #include #include +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +static char *ax2asc2(ax25_address *a, char *buf); +#endif + + /* * Interface to generic neighbour cache. 
*/ @@ -304,7 +309,7 @@ u8 *dst_ha = NULL; struct device *dev = neigh->dev; u32 target = *(u32*)neigh->primary_key; - int probes = neigh->probes; + int probes = atomic_read(&neigh->probes); if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) saddr = skb->nh.iph->saddr; @@ -315,6 +320,7 @@ if (!(neigh->nud_state&NUD_VALID)) printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); dst_ha = neigh->ha; + read_lock_bh(&neigh->lock); } else if ((probes -= neigh->parms->app_probes) < 0) { #ifdef CONFIG_ARPD neigh_app_ns(neigh); @@ -324,6 +330,8 @@ arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_ha, dev->dev_addr, NULL); + if (dst_ha) + read_unlock_bh(&neigh->lock); } /* OBSOLETE FUNCTIONS */ @@ -372,29 +380,25 @@ if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) return 0; - start_bh_atomic(); n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); if (n) { n->used = jiffies; if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { - memcpy(haddr, n->ha, dev->addr_len); + read_lock_bh(&n->lock); + memcpy(haddr, n->ha, dev->addr_len); + read_unlock_bh(&n->lock); neigh_release(n); - end_bh_atomic(); return 0; } + neigh_release(n); } else kfree_skb(skb); - neigh_release(n); - end_bh_atomic(); return 1; } /* END OF OBSOLETE FUNCTIONS */ -/* - * Note: requires bh_atomic locking. 
- */ int arp_bind_neighbour(struct dst_entry *dst) { struct device *dev = dst->dev; @@ -672,7 +676,8 @@ (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && (IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); - neigh_release(n); + if (n) + neigh_release(n); if (skb->stamp.tv_sec == 0 || skb->pkt_type == PACKET_HOST || @@ -785,7 +790,6 @@ return -EINVAL; err = -ENOBUFS; - start_bh_atomic(); neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1); if (neigh) { unsigned state = NUD_STALE; @@ -795,7 +799,6 @@ r->arp_ha.sa_data : NULL, state, 1, 0); neigh_release(neigh); } - end_bh_atomic(); return err; } @@ -819,17 +822,17 @@ struct neighbour *neigh; int err = -ENXIO; - start_bh_atomic(); - neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { + read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); r->arp_ha.sa_family = dev->type; strncpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); - r->arp_flags = arp_state_to_flags(neigh); neigh_release(neigh); err = 0; } - end_bh_atomic(); return err; } @@ -867,14 +870,12 @@ return -EINVAL; } err = -ENXIO; - start_bh_atomic(); - neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { if (neigh->nud_state&~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0); neigh_release(neigh); } - end_bh_atomic(); return err; } @@ -961,16 +962,16 @@ char hbuffer[HBUFFERLEN]; int i,j,k; const char hexbuf[] = "0123456789ABCDEF"; + char abuf[16]; size = sprintf(buffer,"IP address HW type Flags HW address Mask Device\n"); pos+=size; len+=size; - neigh_table_lock(&arp_tbl); - - for(i=0; i<=NEIGH_HASHMASK; i++) { + for(i=0; i<=NEIGH_HASHMASK; i++) { struct neighbour *n; + read_lock_bh(&arp_tbl.lock); for (n=arp_tbl.hash_buckets[i]; n; n=n->next) { struct device *dev = n->dev; int hatype = 
dev->type; @@ -979,17 +980,14 @@ if (!(n->nud_state&~NUD_NOARP)) continue; - /* I'd get great pleasure deleting - this ugly code. Let's output it in hexadecimal format. - "arp" utility will eventually repaired --ANK - */ -#if 1 /* UGLY CODE */ + read_lock(&n->lock); + /* * Convert hardware address to XX:XX:XX:XX ... form. */ #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) - strcpy(hbuffer,ax2asc((ax25_address *)n->ha)); + ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k=0,j=0;kaddr_len;j++) { @@ -998,37 +996,33 @@ hbuffer[k++]=':'; } hbuffer[--k]=0; - + #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) } #endif -#else - if ((neigh->nud_state&NUD_VALID) && dev->addr_len) { - int j; - for (j=0; j < dev->addr_len; j++) - sprintf(hbuffer+2*j, "%02x", neigh->ha[j]); - } else - sprintf(hbuffer, "0"); -#endif size = sprintf(buffer+len, "%-17s0x%-10x0x%-10x%s", - in_ntoa(*(u32*)n->primary_key), + in_ntoa2(*(u32*)n->primary_key, abuf), hatype, arp_state_to_flags(n), hbuffer); size += sprintf(buffer+len+size, " %-17s %s\n", "*", dev->name); + read_unlock(&n->lock); len += size; pos += size; if (pos <= offset) len=0; - if (pos >= offset+length) - goto done; + if (pos >= offset+length) { + read_unlock_bh(&arp_tbl.lock); + goto done; + } } + read_unlock_bh(&arp_tbl.lock); } for (i=0; i<=PNEIGH_HASHMASK; i++) { @@ -1039,7 +1033,7 @@ size = sprintf(buffer+len, "%-17s0x%-10x0x%-10x%s", - in_ntoa(*(u32*)n->key), + in_ntoa2(*(u32*)n->key, abuf), hatype, ATF_PUBL|ATF_PERM, "00:00:00:00:00:00"); @@ -1058,7 +1052,6 @@ } done: - neigh_table_unlock(&arp_tbl); *start = buffer+len-(pos-offset); /* Start of wanted data */ len = pos-offset; /* Start slop */ @@ -1117,14 +1110,13 @@ } -#ifdef CONFIG_AX25_MODULE +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) /* * ax25 -> ASCII conversion */ -char *ax2asc(ax25_address *a) +char *ax2asc2(ax25_address *a, char *buf) { - static char buf[11]; char c, *s; int 
n; diff -ur ../vger3-990605/linux/net/ipv4/devinet.c linux/net/ipv4/devinet.c --- ../vger3-990605/linux/net/ipv4/devinet.c Tue Jun 1 18:05:19 1999 +++ linux/net/ipv4/devinet.c Sat Jun 5 21:05:59 1999 @@ -659,19 +659,19 @@ in this case. It is importnat that lo is the first interface in dev_base list. */ - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev=dev_base; dev; dev=dev->next) { if ((in_dev=dev->ip_ptr) == NULL) continue; for_primary_ifa(in_dev) { if (ifa->ifa_scope <= scope) { - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return ifa->ifa_local; } } endfor_ifa(in_dev); } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return 0; } @@ -792,7 +792,7 @@ s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx) continue; @@ -810,7 +810,7 @@ } } done: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); cb->args[0] = idx; cb->args[1] = ip_idx; @@ -885,13 +885,13 @@ ipv4_devconf.accept_redirects = !on; ipv4_devconf_dflt.forwarding = on; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev = dev->ip_ptr; if (in_dev) in_dev->cnf.forwarding = on; } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); rt_cache_flush(0); diff -ur ../vger3-990605/linux/net/ipv4/fib_frontend.c linux/net/ipv4/fib_frontend.c --- ../vger3-990605/linux/net/ipv4/fib_frontend.c Sun Mar 21 17:29:25 1999 +++ linux/net/ipv4/fib_frontend.c Sat Jun 5 19:52:13 1999 @@ -123,13 +123,11 @@ first = 0; } - /* rtnl_shlock(); -- it is pointless at the moment --ANK */ if (main_table && count > 0) { int n = main_table->tb_get_info(main_table, ptr, first, count); count -= n; ptr += n*128; } - /* rtnl_shunlock(); */ len = ptr - *start; if (len >= length) return length; diff -ur ../vger3-990605/linux/net/ipv4/fib_hash.c 
linux/net/ipv4/fib_hash.c --- ../vger3-990605/linux/net/ipv4/fib_hash.c Fri May 28 19:53:14 1999 +++ linux/net/ipv4/fib_hash.c Sat Jun 5 19:52:13 1999 @@ -246,10 +246,10 @@ fz->fz_mask = inet_make_mask(z); /* Find the first not empty zone with more specific mask */ - write_lock_bh(&fib_hash_lock); for (i=z+1; i<=32; i++) if (table->fn_zones[i]) break; + write_lock_bh(&fib_hash_lock); if (i>32) { /* No more specific masks, we are the first. */ fz->fz_next = table->fn_zone_list; @@ -270,7 +270,7 @@ struct fn_zone *fz; struct fn_hash *t = (struct fn_hash*)tb->tb_data; - read_lock_bh(&fib_hash_lock); + read_lock(&fib_hash_lock); for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { struct fib_node *f; fn_key_t k = fz_key(key->dst, fz); @@ -307,7 +307,7 @@ } err = 1; out: - read_unlock_bh(&fib_hash_lock); + read_unlock(&fib_hash_lock); return err; } @@ -353,7 +353,7 @@ last_resort = NULL; order = -1; - read_lock_bh(&fib_hash_lock); + read_lock(&fib_hash_lock); for (f = fz->fz_hash[0]; f; f = f->fn_next) { struct fib_info *next_fi = FIB_INFO(f); @@ -395,7 +395,7 @@ res->fi = last_resort; fn_hash_last_dflt = last_idx; out: - read_unlock_bh(&fib_hash_lock); + read_unlock(&fib_hash_lock); } #define FIB_SCAN(f, fp) \ @@ -469,7 +469,6 @@ fp = fz_chain_p(key, fz); - write_lock_bh(&fib_hash_lock); /* * Scan list to find the first route with the same destination @@ -574,12 +573,15 @@ */ new_f->fn_next = f; + write_lock_bh(&fib_hash_lock); *fp = new_f; + write_unlock_bh(&fib_hash_lock); fz->fz_nent++; if (del_fp) { f = *del_fp; /* Unlink replaced node */ + write_lock_bh(&fib_hash_lock); *del_fp = f->fn_next; write_unlock_bh(&fib_hash_lock); @@ -590,14 +592,12 @@ fn_free_node(f); fz->fz_nent--; } else { - write_unlock_bh(&fib_hash_lock); rt_cache_flush(-1); } rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->tb_id, n, req); return 0; out: - write_unlock_bh(&fib_hash_lock); fib_release_info(fi); return err; } @@ -635,13 +635,11 @@ fp = fz_chain_p(key, fz); - write_lock_bh(&fib_hash_lock); 
FIB_SCAN(f, fp) { if (fn_key_eq(f->fn_key, key)) break; if (fn_key_leq(key, f->fn_key)) { - write_unlock_bh(&fib_hash_lock); return -ESRCH; } } @@ -658,7 +656,6 @@ struct fib_info * fi = FIB_INFO(f); if (f->fn_state&FN_S_ZOMBIE) { - write_unlock_bh(&fib_hash_lock); return -ESRCH; } matched++; @@ -676,6 +673,7 @@ rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); if (matched != 1) { + write_lock_bh(&fib_hash_lock); *del_fp = f->fn_next; write_unlock_bh(&fib_hash_lock); @@ -684,7 +682,6 @@ fn_free_node(f); fz->fz_nent--; } else { - write_unlock_bh(&fib_hash_lock); f->fn_state |= FN_S_ZOMBIE; if (f->fn_state&FN_S_ACCESSED) { f->fn_state &= ~FN_S_ACCESSED; @@ -696,7 +693,6 @@ return 0; } - write_unlock_bh(&fib_hash_lock); return -ESRCH; } @@ -710,7 +706,9 @@ struct fib_info *fi = FIB_INFO(f); if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) { + write_lock_bh(&fib_hash_lock); *fp = f->fn_next; + write_unlock_bh(&fib_hash_lock); fn_free_node(f); found++; @@ -728,7 +726,6 @@ int found = 0; fib_hash_zombies = 0; - write_lock_bh(&fib_hash_lock); for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { int i; int tmp = 0; @@ -737,7 +734,6 @@ fz->fz_nent -= tmp; found += tmp; } - write_unlock_bh(&fib_hash_lock); return found; } @@ -751,7 +747,7 @@ int pos = 0; int n = 0; - read_lock_bh(&fib_hash_lock); + read_lock(&fib_hash_lock); for (fz=table->fn_zone_list; fz; fz = fz->fz_next) { int i; struct fib_node *f; @@ -782,7 +778,7 @@ } } out: - read_unlock_bh(&fib_hash_lock); + read_unlock(&fib_hash_lock); return n; } #endif @@ -845,18 +841,18 @@ struct fn_hash *table = (struct fn_hash*)tb->tb_data; s_m = cb->args[1]; - read_lock_bh(&fib_hash_lock); + read_lock(&fib_hash_lock); for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { if (m < s_m) continue; if (m > s_m) memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0])); if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { cb->args[1] = m; - read_unlock_bh(&fib_hash_lock); + 
read_unlock(&fib_hash_lock); return -1; } } - read_unlock_bh(&fib_hash_lock); + read_unlock(&fib_hash_lock); cb->args[1] = m; return skb->len; } diff -ur ../vger3-990605/linux/net/ipv4/fib_rules.c linux/net/ipv4/fib_rules.c --- ../vger3-990605/linux/net/ipv4/fib_rules.c Fri May 28 19:53:16 1999 +++ linux/net/ipv4/fib_rules.c Sat Jun 5 19:52:13 1999 @@ -88,7 +88,6 @@ struct fib_rule *r, **rp; int err = -ESRCH; - write_lock_bh(&fib_rules_lock); for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) { if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) && rtm->rtm_src_len == r->r_src_len && @@ -106,14 +105,15 @@ if (r == &local_rule) break; + write_lock_bh(&fib_rules_lock); *rp = r->r_next; + write_unlock_bh(&fib_rules_lock); if (r != &default_rule && r != &main_rule) kfree(r); err = 0; break; } } - write_unlock_bh(&fib_rules_lock); return err; } @@ -192,7 +192,6 @@ memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4); #endif - write_lock_bh(&fib_rules_lock); rp = &fib_rules; if (!new_r->r_preference) { r = fib_rules; @@ -210,6 +209,7 @@ } new_r->r_next = r; + write_lock_bh(&fib_rules_lock); *rp = new_r; write_unlock_bh(&fib_rules_lock); return 0; @@ -255,24 +255,26 @@ { struct fib_rule *r; - write_lock_bh(&fib_rules_lock); for (r=fib_rules; r; r=r->r_next) { - if (r->r_ifindex == dev->ifindex) + if (r->r_ifindex == dev->ifindex) { + write_lock_bh(&fib_rules_lock); r->r_ifindex = -1; + write_unlock_bh(&fib_rules_lock); + } } - write_unlock_bh(&fib_rules_lock); } static void fib_rules_attach(struct device *dev) { struct fib_rule *r; - write_lock_bh(&fib_rules_lock); for (r=fib_rules; r; r=r->r_next) { - if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) + if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) { + write_lock_bh(&fib_rules_lock); r->r_ifindex = dev->ifindex; + write_unlock_bh(&fib_rules_lock); + } } - write_unlock_bh(&fib_rules_lock); } int fib_lookup(const struct rt_key *key, struct fib_result *res) @@ 
-285,7 +287,7 @@ u32 saddr = key->src; FRprintk("Lookup: %08x <- %08x ", key->dst, key->src); - read_lock_bh(&fib_rules_lock); + read_lock(&fib_rules_lock); for (r = fib_rules; r; r=r->r_next) { if (((saddr^r->r_src) & r->r_srcmask) || ((daddr^r->r_dst) & r->r_dstmask) || @@ -305,14 +307,14 @@ policy = r; break; case RTN_UNREACHABLE: - read_unlock_bh(&fib_rules_lock); + read_unlock(&fib_rules_lock); return -ENETUNREACH; default: case RTN_BLACKHOLE: - read_unlock_bh(&fib_rules_lock); + read_unlock(&fib_rules_lock); return -EINVAL; case RTN_PROHIBIT: - read_unlock_bh(&fib_rules_lock); + read_unlock(&fib_rules_lock); return -EACCES; } @@ -322,16 +324,16 @@ if (err == 0) { FRprintk("ok\n"); res->r = policy; - read_unlock_bh(&fib_rules_lock); + read_unlock(&fib_rules_lock); return 0; } if (err < 0 && err != -EAGAIN) { - read_unlock_bh(&fib_rules_lock); + read_unlock(&fib_rules_lock); return err; } } FRprintk("FAILURE\n"); - read_unlock_bh(&fib_rules_lock); + read_unlock(&fib_rules_lock); return -ENETUNREACH; } @@ -418,14 +420,14 @@ int s_idx = cb->args[0]; struct fib_rule *r; - read_lock_bh(&fib_rules_lock); + read_lock(&fib_rules_lock); for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { if (idx < s_idx) continue; if (inet_fill_rule(skb, r, cb) < 0) break; } - read_unlock_bh(&fib_rules_lock); + read_unlock(&fib_rules_lock); cb->args[0] = idx; return skb->len; diff -ur ../vger3-990605/linux/net/ipv4/icmp.c linux/net/ipv4/icmp.c --- ../vger3-990605/linux/net/ipv4/icmp.c Thu Jun 3 18:34:54 1999 +++ linux/net/ipv4/icmp.c Sat Jun 5 22:38:20 1999 @@ -699,8 +699,8 @@ case ICMP_FRAG_NEEDED: if (ipv4_config.no_pmtu_disc) { if (net_ratelimit()) - printk(KERN_INFO "ICMP: %s: fragmentation needed and DF set.\n", - in_ntoa(iph->daddr)); + printk(KERN_INFO "ICMP: %d.%d.%d.%d: fragmentation needed and DF set.\n", + NIPQUAD(iph->daddr)); } else { unsigned short new_mtu; new_mtu = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu)); @@ -711,7 +711,7 @@ break; case ICMP_SR_FAILED: if 
(net_ratelimit()) - printk(KERN_INFO "ICMP: %s: Source Route Failed.\n", in_ntoa(iph->daddr)); + printk(KERN_INFO "ICMP: %d.%d.%d.%d: Source Route Failed.\n", NIPQUAD(iph->daddr)); break; default: break; @@ -741,8 +741,8 @@ if (inet_addr_type(iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) - printk(KERN_WARNING "%s sent an invalid ICMP error to a broadcast.\n", - in_ntoa(skb->nh.iph->saddr)); + printk(KERN_WARNING "%d.%d.%d.%d sent an invalid ICMP error to a broadcast.\n", + NIPQUAD(skb->nh.iph->saddr)); return; } } diff -ur ../vger3-990605/linux/net/ipv4/igmp.c linux/net/ipv4/igmp.c --- ../vger3-990605/linux/net/ipv4/igmp.c Fri May 28 19:53:18 1999 +++ linux/net/ipv4/igmp.c Sun Jun 6 18:45:04 1999 @@ -97,6 +97,15 @@ #include #endif +/* Big mc list lock for all the devices */ +static rwlock_t ip_mc_lock = RW_LOCK_UNLOCKED; +/* Big mc list semaphore for all the sockets. + We do not refer to this list in IP data paths or from BH, + so that semaphore is OK. + */ +DECLARE_MUTEX(ip_sk_mc_sem); + + #define IP_MAX_MEMBERSHIPS 20 #ifdef CONFIG_IP_MULTICAST @@ -216,6 +225,8 @@ struct in_device *in_dev = im->interface; int err; + read_lock(&ip_mc_lock); + im->tm_running=0; if (IGMP_V1_SEEN(in_dev)) @@ -234,6 +245,7 @@ igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); } im->reporter = 1; + read_unlock(&ip_mc_lock); } static void igmp_heard_report(struct in_device *in_dev, u32 group) @@ -245,14 +257,16 @@ if (LOCAL_MCAST(group)) return; + read_lock(&ip_mc_lock); for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (im->multiaddr == group) { igmp_stop_timer(im); im->reporter = 0; im->unsolicit_count = 0; - return; + break; } } + read_unlock(&ip_mc_lock); } static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time, @@ -281,6 +295,7 @@ * - Use the igmp->igmp_code field as the maximum * delay possible */ + read_lock(&ip_mc_lock); for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (group && group != im->multiaddr) continue; @@ -291,6 +306,7 @@ 
igmp_stop_timer(im); igmp_start_timer(im, max_delay); } + read_unlock(&ip_mc_lock); } int igmp_rcv(struct sk_buff *skb, unsigned short len) @@ -380,9 +396,7 @@ if (LOCAL_MCAST(im->multiaddr)) return; - start_bh_atomic(); igmp_stop_timer(im); - end_bh_atomic(); if (im->reporter && !IGMP_V1_SEEN(im->interface)) igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); @@ -400,9 +414,7 @@ if (LOCAL_MCAST(im->multiaddr)) return; - start_bh_atomic(); igmp_start_timer(im, IGMP_Initial_Report_Delay); - end_bh_atomic(); #endif } @@ -422,16 +434,17 @@ im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + write_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) { if (i->multiaddr == addr) { i->users++; if (im) kfree(im); - return; + goto out; } } if (!im) - return; + goto out; im->users=1; im->interface=in_dev; im->multiaddr=addr; @@ -447,9 +460,13 @@ im->next=in_dev->mc_list; in_dev->mc_list=im; igmp_group_added(im); + write_unlock_bh(&ip_mc_lock); if (in_dev->dev->flags & IFF_UP) ip_rt_multicast_event(in_dev); return; +out: + write_unlock_bh(&ip_mc_lock); + return; } /* @@ -458,22 +475,27 @@ int ip_mc_dec_group(struct in_device *in_dev, u32 addr) { + int err = -ESRCH; struct ip_mc_list *i, **ip; + write_lock_bh(&ip_mc_lock); for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { if (i->multiaddr==addr) { if (--i->users == 0) { *ip = i->next; - synchronize_bh(); - igmp_group_dropped(i); + + write_unlock_bh(&ip_mc_lock); if (in_dev->dev->flags & IFF_UP) ip_rt_multicast_event(in_dev); kfree_s(i, sizeof(*i)); + return 0; } - return 0; + err = 0; + break; } } + write_unlock_bh(&ip_mc_lock); return -ESRCH; } @@ -483,8 +505,10 @@ { struct ip_mc_list *i; + read_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) igmp_group_dropped(i); + read_unlock_bh(&ip_mc_lock); ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } @@ -497,8 +521,10 @@ ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); + read_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) 
igmp_group_added(i); + read_unlock_bh(&ip_mc_lock); } /* @@ -509,11 +535,13 @@ { struct ip_mc_list *i; + write_lock_bh(&ip_mc_lock); while ((i = in_dev->mc_list) != NULL) { in_dev->mc_list = i->next; igmp_group_dropped(i); kfree_s(i, sizeof(*i)); } + write_unlock_bh(&ip_mc_lock); } static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) @@ -570,6 +598,7 @@ iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); err = -EADDRINUSE; + down(&ip_sk_mc_sem); for (i=sk->ip_mc_list; i; i=i->next) { if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { /* New style additions are reference counted */ @@ -577,13 +606,13 @@ i->count++; err = 0; } - goto done; + goto done_unlock; } count++; } err = -ENOBUFS; if (iml == NULL || count >= sysctl_igmp_max_memberships) - goto done; + goto done_unlock; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next = sk->ip_mc_list; iml->count = 1; @@ -591,6 +620,9 @@ ip_mc_inc_group(in_dev, addr); iml = NULL; err = 0; + +done_unlock: + up(&ip_sk_mc_sem); done: rtnl_shunlock(); if (iml) @@ -606,6 +638,7 @@ { struct ip_mc_socklist *iml, **imlp; + down(&ip_sk_mc_sem); for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) { if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && iml->multi.imr_address.s_addr==imr->imr_address.s_addr && @@ -615,7 +648,7 @@ return 0; *imlp = iml->next; - synchronize_bh(); + up(&ip_sk_mc_sem); in_dev = inetdev_by_index(iml->multi.imr_ifindex); if (in_dev) @@ -624,6 +657,7 @@ return 0; } } + up(&ip_sk_mc_sem); return -EADDRNOTAVAIL; } @@ -635,13 +669,37 @@ { struct ip_mc_socklist *iml; + down(&ip_sk_mc_sem); while ((iml=sk->ip_mc_list) != NULL) { struct in_device *in_dev; sk->ip_mc_list = iml->next; + up(&ip_sk_mc_sem); + if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); sock_kfree_s(sk, iml, sizeof(*iml)); + + down(&ip_sk_mc_sem); } + up(&ip_sk_mc_sem); +} + +int ip_check_mc(struct device *dev, u32 mc_addr) 
+{ + struct in_device *in_dev = dev->ip_ptr; + struct ip_mc_list *im; + + if (in_dev) { + read_lock(&ip_mc_lock); + for (im=in_dev->mc_list; im; im=im->next) { + if (im->multiaddr == mc_addr) { + read_unlock(&ip_mc_lock); + return 1; + } + } + read_unlock(&ip_mc_lock); + } + return 0; } @@ -653,10 +711,10 @@ struct ip_mc_list *im; int len=0; struct device *dev; - + len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); - - read_lock_bh(&dev_base_lock); + + read_lock(&dev_base_lock); for(dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev = dev->ip_ptr; char *querier = "NONE"; @@ -669,6 +727,7 @@ len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n", dev->ifindex, dev->name, dev->mc_count, querier); + read_lock(&ip_mc_lock); for (im = in_dev->mc_list; im; im = im->next) { len+=sprintf(buffer+len, "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", @@ -681,12 +740,15 @@ len=0; begin=pos; } - if(pos>offset+length) + if(pos>offset+length) { + read_unlock(&ip_mc_lock); goto done; + } } + read_unlock(&ip_mc_lock); } done: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); *start=buffer+(offset-begin); len-=(offset-begin); diff -ur ../vger3-990605/linux/net/ipv4/ip_input.c linux/net/ipv4/ip_input.c --- ../vger3-990605/linux/net/ipv4/ip_input.c Mon May 31 17:57:22 1999 +++ linux/net/ipv4/ip_input.c Sat Jun 5 19:52:13 1999 @@ -392,21 +392,17 @@ if (skb->dst == NULL) { if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) goto drop; -#ifdef CONFIG_CPU_IS_SLOW - if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) && - IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { - goto drop; - } -#endif } #ifdef CONFIG_NET_CLS_ROUTE if (skb->dst->tclassid) { u32 idx = skb->dst->tclassid; + write_lock(&ip_rt_acct_lock); ip_rt_acct[idx&0xFF].o_packets++; ip_rt_acct[idx&0xFF].o_bytes+=skb->len; ip_rt_acct[(idx>>16)&0xFF].i_packets++; ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len; + write_unlock(&ip_rt_acct_lock); } #endif diff -ur 
../vger3-990605/linux/net/ipv4/ipconfig.c linux/net/ipv4/ipconfig.c --- ../vger3-990605/linux/net/ipv4/ipconfig.c Fri May 28 19:53:28 1999 +++ linux/net/ipv4/ipconfig.c Sat Jun 5 21:04:38 1999 @@ -112,7 +112,7 @@ unsigned short oflags; last = &ic_first_dev; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && @@ -144,7 +144,7 @@ DBG(("IP-Config: Opened %s (able=%d)\n", dev->name, able)); } } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); *last = NULL; diff -ur ../vger3-990605/linux/net/ipv4/ipmr.c linux/net/ipv4/ipmr.c --- ../vger3-990605/linux/net/ipv4/ipmr.c Thu Jun 3 18:35:01 1999 +++ linux/net/ipv4/ipmr.c Sun Jun 6 21:34:06 1999 @@ -23,6 +23,8 @@ * Brad Parker : Better behaviour on mrouted upcall * overflow. * Carlos Picoto : PIMv1 Support + * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header + * Relax this requirement to work with older peers.
* */ @@ -431,7 +433,7 @@ skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; } - err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); + err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); } else #endif ip_mr_forward(skb, cache, 0); @@ -1343,7 +1345,8 @@ pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || (pim->flags&PIM_NULL_REGISTER) || reg_dev == NULL || - ip_compute_csum((void *)pim, len)) { + (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && + ip_compute_csum((void *)pim, len))) { kfree_skb(skb); return -EINVAL; } diff -ur ../vger3-990605/linux/net/ipv4/route.c linux/net/ipv4/route.c --- ../vger3-990605/linux/net/ipv4/route.c Fri May 28 19:53:40 1999 +++ linux/net/ipv4/route.c Sat Jun 5 19:52:13 1999 @@ -1996,6 +1996,7 @@ #ifdef CONFIG_NET_CLS_ROUTE struct ip_rt_acct ip_rt_acct[256]; +rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED; #ifdef CONFIG_PROC_FS static int ip_rt_acct_read(char *buffer, char **start, off_t offset, @@ -2008,9 +2009,9 @@ *eof = 1; } if (length > 0) { - start_bh_atomic(); + read_lock_bh(&ip_rt_acct_lock); memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length); - end_bh_atomic(); + read_unlock_bh(&ip_rt_acct_lock); return length; } return 0; diff -ur ../vger3-990605/linux/net/ipv4/utils.c linux/net/ipv4/utils.c --- ../vger3-990605/linux/net/ipv4/utils.c Sun Dec 14 00:53:03 1997 +++ linux/net/ipv4/utils.c Sat Jun 5 20:07:43 1999 @@ -57,6 +57,11 @@ return(buff); } +char *in_ntoa2(__u32 in, char *buff) +{ + sprintf(buff, "%d.%d.%d.%d", NIPQUAD(in)); + return buff; +} /* * Convert an ASCII string to binary IP. diff -ur ../vger3-990605/linux/net/ipv6/addrconf.c linux/net/ipv6/addrconf.c --- ../vger3-990605/linux/net/ipv6/addrconf.c Fri May 28 19:54:20 1999 +++ linux/net/ipv6/addrconf.c Sat Jun 5 20:32:51 1999 @@ -100,9 +100,7 @@ 1. The result of inet6_add_addr() is used only inside lock or from bh_atomic context. - 2. inet6_get_lladdr() is used only from bh protected context. - - 3. 
The result of ipv6_chk_addr() is not used outside of bh protected context. + 2. The result of ipv6_chk_addr() is not used outside of bh protected context. */ static __inline__ void addrconf_lock(void) @@ -463,7 +461,7 @@ return err; } -struct inet6_ifaddr * ipv6_get_lladdr(struct device *dev) +int ipv6_get_lladdr(struct device *dev, struct in6_addr *addr) { struct inet6_ifaddr *ifp = NULL; struct inet6_dev *idev; @@ -471,12 +469,15 @@ if ((idev = ipv6_get_idev(dev)) != NULL) { addrconf_lock(); for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == IFA_LINK) - break; + if (ifp->scope == IFA_LINK) { + ipv6_addr_copy(addr, &ifp->addr); + addrconf_unlock(); + return 0; + } } addrconf_unlock(); } - return ifp; + return -EADDRNOTAVAIL; } /* @@ -982,7 +983,7 @@ return; } - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if (dev->ip_ptr && (dev->flags & IFF_UP)) { struct in_device * in_dev = dev->ip_ptr; @@ -1001,7 +1002,6 @@ flag |= IFA_HOST; } - read_unlock_bh(&dev_base_lock); addrconf_lock(); ifp = ipv6_add_addr(idev, &addr, flag); if (ifp) { @@ -1013,11 +1013,10 @@ ipv6_ifa_notify(RTM_NEWADDR, ifp); } addrconf_unlock(); - read_lock_bh(&dev_base_lock); } } } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); } static void init_loopback(struct device *dev) @@ -1846,12 +1845,11 @@ struct device *dev; /* This takes sense only during module load. 
*/ - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { if (!(dev->flags&IFF_UP)) continue; - read_unlock_bh(&dev_base_lock); switch (dev->type) { case ARPHRD_LOOPBACK: init_loopback(dev); @@ -1862,9 +1860,8 @@ default: /* Ignore all other */ } - read_lock_bh(&dev_base_lock); } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); #endif #ifdef CONFIG_PROC_FS diff -ur ../vger3-990605/linux/net/ipv6/ip6_output.c linux/net/ipv6/ip6_output.c --- ../vger3-990605/linux/net/ipv6/ip6_output.c Thu Jun 3 18:35:11 1999 +++ linux/net/ipv6/ip6_output.c Sat Jun 5 19:52:13 1999 @@ -75,19 +75,10 @@ } if (hh) { -#ifdef __alpha__ - /* Alpha has disguisting memcpy. Help it. */ - u64 *aligned_hdr = (u64*)(skb->data - 16); - u64 *aligned_hdr0 = hh->hh_data; - read_lock_irq(&hh->hh_lock); - aligned_hdr[0] = aligned_hdr0[0]; - aligned_hdr[1] = aligned_hdr0[1]; -#else - read_lock_irq(&hh->hh_lock); + read_lock_bh(&hh->hh_lock); memcpy(skb->data - 16, hh->hh_data, 16); -#endif - read_unlock_irq(&hh->hh_lock); - skb_push(skb, dev->hard_header_len); + read_unlock_bh(&hh->hh_lock); + skb_push(skb, hh->hh_len); return hh->hh_output(skb); } else if (dst->neighbour) return dst->neighbour->output(skb); diff -ur ../vger3-990605/linux/net/ipv6/mcast.c linux/net/ipv6/mcast.c --- ../vger3-990605/linux/net/ipv6/mcast.c Thu Jun 3 18:35:12 1999 +++ linux/net/ipv6/mcast.c Sun Jun 6 18:45:04 1999 @@ -53,6 +53,11 @@ #define MDBG(x) #endif +/* Big mc list lock for all the devices */ +static rwlock_t ipv6_mc_lock = RW_LOCK_UNLOCKED; +/* Big mc list lock for all the sockets */ +static rwlock_t ipv6_sk_mc_lock = RW_LOCK_UNLOCKED; + static struct socket *igmp6_socket; static void igmp6_join_group(struct ifmcaddr6 *ma); @@ -115,8 +120,10 @@ return err; } + write_lock_bh(&ipv6_sk_mc_lock); mc_lst->next = np->ipv6_mc_list; np->ipv6_mc_list = mc_lst; + write_unlock_bh(&ipv6_sk_mc_lock); return 0; } @@ -129,13 +136,14 @@ struct ipv6_pinfo *np = 
&sk->net_pinfo.af_inet6; struct ipv6_mc_socklist *mc_lst, **lnk; + write_lock_bh(&ipv6_sk_mc_lock); for (lnk = &np->ipv6_mc_list; (mc_lst = *lnk) !=NULL ; lnk = &mc_lst->next) { if (mc_lst->ifindex == ifindex && ipv6_addr_cmp(&mc_lst->addr, addr) == 0) { struct device *dev; *lnk = mc_lst->next; - synchronize_bh(); + write_unlock_bh(&ipv6_sk_mc_lock); if ((dev = dev_get_by_index(ifindex)) != NULL) ipv6_dev_mc_dec(dev, &mc_lst->addr); @@ -143,6 +151,7 @@ return 0; } } + write_unlock_bh(&ipv6_sk_mc_lock); return -ENOENT; } @@ -152,15 +161,38 @@ struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6_mc_socklist *mc_lst; + write_lock_bh(&ipv6_sk_mc_lock); while ((mc_lst = np->ipv6_mc_list) != NULL) { - struct device *dev = dev_get_by_index(mc_lst->ifindex); + struct device *dev; + + np->ipv6_mc_list = mc_lst->next; + write_unlock_bh(&ipv6_sk_mc_lock); + dev = dev_get_by_index(mc_lst->ifindex); if (dev) ipv6_dev_mc_dec(dev, &mc_lst->addr); - np->ipv6_mc_list = mc_lst->next; sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + + write_lock_bh(&ipv6_sk_mc_lock); } + write_unlock_bh(&ipv6_sk_mc_lock); +} + +int inet6_mc_check(struct sock *sk, struct in6_addr *addr) +{ + struct ipv6_mc_socklist *mc; + + read_lock(&ipv6_sk_mc_lock); + for (mc = sk->net_pinfo.af_inet6.ipv6_mc_list; mc; mc=mc->next) { + if (ipv6_addr_cmp(&mc->addr, addr) == 0) { + read_unlock(&ipv6_sk_mc_lock); + return 1; + } + } + read_unlock(&ipv6_sk_mc_lock); + + return 0; } static int igmp6_group_added(struct ifmcaddr6 *mc) @@ -210,9 +242,11 @@ hash = ipv6_addr_hash(addr); + write_lock_bh(&ipv6_mc_lock); for (mc = inet6_mcast_lst[hash]; mc; mc = mc->next) { if (ipv6_addr_cmp(&mc->mca_addr, addr) == 0 && mc->dev == dev) { atomic_inc(&mc->mca_users); + write_unlock_bh(&ipv6_mc_lock); return 0; } } @@ -223,8 +257,10 @@ mc = kmalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC); - if (mc == NULL) + if (mc == NULL) { + write_unlock_bh(&ipv6_mc_lock); return -ENOMEM; + } memset(mc, 0, sizeof(struct ifmcaddr6)); 
mc->mca_timer.function = igmp6_timer_handler; @@ -242,6 +278,8 @@ igmp6_group_added(mc); + write_unlock_bh(&ipv6_mc_lock); + return 0; } @@ -257,7 +295,6 @@ for (lnk = &idev->mc_list; (iter = *lnk) != NULL; lnk = &iter->if_next) { if (iter == ma) { *lnk = iter->if_next; - synchronize_bh(); return; } } @@ -274,20 +311,22 @@ hash = ipv6_addr_hash(addr); + write_lock_bh(&ipv6_mc_lock); for (lnk = &inet6_mcast_lst[hash]; (ma=*lnk) != NULL; lnk = &ma->next) { if (ipv6_addr_cmp(&ma->mca_addr, addr) == 0 && ma->dev == dev) { if (atomic_dec_and_test(&ma->mca_users)) { igmp6_group_dropped(ma); *lnk = ma->next; - synchronize_bh(); ipv6_mca_remove(dev, ma); kfree(ma); } + write_unlock_bh(&ipv6_mc_lock); return 0; } } + write_unlock_bh(&ipv6_mc_lock); return -ENOENT; } @@ -302,10 +341,14 @@ hash = ipv6_addr_hash(addr); + read_lock_bh(&ipv6_mc_lock); for (mc = inet6_mcast_lst[hash]; mc; mc=mc->next) { - if (mc->dev == dev && ipv6_addr_cmp(&mc->mca_addr, addr) == 0) + if (mc->dev == dev && ipv6_addr_cmp(&mc->mca_addr, addr) == 0) { + read_unlock_bh(&ipv6_mc_lock); return 1; + } } + read_unlock_bh(&ipv6_mc_lock); return 0; } @@ -364,11 +407,14 @@ if (idev == NULL) return 0; + read_lock(&ipv6_mc_lock); for (ma = idev->mc_list; ma; ma=ma->if_next) igmp6_group_queried(ma, resptime); + read_unlock(&ipv6_mc_lock); } else { int hash = ipv6_addr_hash(addrp); + read_lock(&ipv6_mc_lock); for (ma = inet6_mcast_lst[hash]; ma; ma=ma->next) { if (ma->dev == skb->dev && ipv6_addr_cmp(addrp, &ma->mca_addr) == 0) { @@ -376,6 +422,7 @@ break; } } + read_unlock(&ipv6_mc_lock); } return 0; @@ -410,6 +457,7 @@ hash = ipv6_addr_hash(addrp); + read_lock(&ipv6_mc_lock); for (ma = inet6_mcast_lst[hash]; ma; ma=ma->next) { if ((ma->dev == dev) && ipv6_addr_cmp(&ma->mca_addr, addrp) == 0) { if (ma->mca_flags & MAF_TIMER_RUNNING) { @@ -421,6 +469,7 @@ break; } } + read_unlock(&ipv6_mc_lock); return 0; } @@ -430,9 +479,9 @@ struct sock *sk = igmp6_socket->sk; struct sk_buff *skb; struct icmp6hdr *hdr; - 
struct inet6_ifaddr *ifp; struct in6_addr *snd_addr; struct in6_addr *addrp; + struct in6_addr addr_buf; struct in6_addr all_routers; int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, @@ -461,9 +510,7 @@ dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len); } - ifp = ipv6_get_lladdr(dev); - - if (ifp == NULL) { + if (ipv6_get_lladdr(dev, &addr_buf)) { #if MCAST_DEBUG >= 1 printk(KERN_DEBUG "igmp6: %s no linklocal address\n", dev->name); @@ -471,7 +518,7 @@ return; } - ip6_nd_hdr(sk, skb, dev, &ifp->addr, snd_addr, NEXTHDR_HOP, payload_len); + ip6_nd_hdr(sk, skb, dev, &addr_buf, snd_addr, NEXTHDR_HOP, payload_len); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); @@ -482,7 +529,7 @@ addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr)); ipv6_addr_copy(addrp, addr); - hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, snd_addr, len, + hdr->icmp6_cksum = csum_ipv6_magic(&addr_buf, snd_addr, len, IPPROTO_ICMPV6, csum_partial((__u8 *) hdr, len, 0)); @@ -504,7 +551,6 @@ if ((addr_type & (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK))) return; - start_bh_atomic(); igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT); delay = net_random() % IGMP6_UNSOLICITED_IVAL; @@ -515,7 +561,6 @@ add_timer(&ma->mca_timer); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; - end_bh_atomic(); } static void igmp6_leave_group(struct ifmcaddr6 *ma) @@ -527,22 +572,22 @@ if ((addr_type & IPV6_ADDR_LINKLOCAL)) return; - start_bh_atomic(); if (ma->mca_flags & MAF_LAST_REPORTER) igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REDUCTION); if (ma->mca_flags & MAF_TIMER_RUNNING) del_timer(&ma->mca_timer); - end_bh_atomic(); } void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + read_lock(&ipv6_mc_lock); ma->mca_flags |= MAF_LAST_REPORTER; igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT); ma->mca_flags &= ~MAF_TIMER_RUNNING; + read_unlock(&ipv6_mc_lock); } /* Device going down */ @@ -554,8 +599,10 @@ /* Withdraw 
multicast list */ + read_lock_bh(&ipv6_mc_lock); for (i = idev->mc_list; i; i=i->if_next) igmp6_group_dropped(i); + read_unlock_bh(&ipv6_mc_lock); /* Delete all-nodes address. */ @@ -577,8 +624,10 @@ /* Install multicast list, except for all-nodes (already installed) */ + read_lock(&ipv6_mc_lock); for (i = idev->mc_list; i; i=i->if_next) igmp6_group_added(i); + read_unlock(&ipv6_mc_lock); } /* @@ -590,6 +639,7 @@ int hash; struct ifmcaddr6 *i, **lnk; + write_lock_bh(&ipv6_mc_lock); while ((i = idev->mc_list) != NULL) { idev->mc_list = i->if_next; @@ -598,13 +648,13 @@ for (lnk = &inet6_mcast_lst[hash]; *lnk; lnk = &(*lnk)->next) { if (*lnk == i) { *lnk = i->next; - synchronize_bh(); break; } } igmp6_group_dropped(i); kfree(i); } + write_unlock_bh(&ipv6_mc_lock); } #ifdef CONFIG_PROC_FS @@ -616,13 +666,14 @@ int len=0; struct device *dev; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { struct inet6_dev *idev; if ((idev = ipv6_get_idev(dev)) == NULL) continue; + read_lock_bh(&ipv6_mc_lock); for (im = idev->mc_list; im; im = im->if_next) { int i; @@ -642,14 +693,17 @@ len=0; begin=pos; } - if (pos > offset+length) + if (pos > offset+length) { + read_unlock_bh(&ipv6_mc_lock); goto done; + } } + read_unlock_bh(&ipv6_mc_lock); } *eof = 1; done: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); *start=buffer+(offset-begin); len-=(offset-begin); diff -ur ../vger3-990605/linux/net/ipv6/ndisc.c linux/net/ipv6/ndisc.c --- ../vger3-990605/linux/net/ipv6/ndisc.c Sat May 1 16:31:52 1999 +++ linux/net/ipv6/ndisc.c Sun Jun 6 18:45:07 1999 @@ -268,14 +268,21 @@ ndisc_mc_map(daddr, ha, dev, 1); h_dest = ha; } else if (neigh) { - h_dest = neigh->ha; + read_lock_bh(&neigh->lock); + if (neigh->nud_state&NUD_VALID) { + memcpy(ha, neigh->ha, dev->addr_len); + h_dest = ha; + } + read_unlock_bh(&neigh->lock); } else { neigh = neigh_lookup(&nd_tbl, daddr, dev); if (neigh) { + read_lock_bh(&neigh->lock); if 
(neigh->nud_state&NUD_VALID) { memcpy(ha, neigh->ha, dev->addr_len); h_dest = ha; } + read_unlock_bh(&neigh->lock); neigh_release(neigh); } } @@ -362,6 +369,7 @@ struct sock *sk = ndisc_socket->sk; struct sk_buff *skb; struct nd_msg *msg; + struct in6_addr addr_buf; int len; int err; @@ -377,13 +385,8 @@ } if (saddr == NULL) { - struct inet6_ifaddr *ifa; - - /* use link local address */ - ifa = ipv6_get_lladdr(dev); - - if (ifa) - saddr = &ifa->addr; + if (!ipv6_get_lladdr(dev, &addr_buf)) + saddr = &addr_buf; } if (ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) { @@ -501,13 +504,15 @@ kfree_skb(skb); } +/* Called with locked neigh: either read or both */ + static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) { struct in6_addr *saddr = NULL; struct in6_addr mcaddr; struct device *dev = neigh->dev; struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; - int probes = neigh->probes; + int probes = atomic_read(&neigh->probes); if (skb && ipv6_chk_addr(&skb->nh.ipv6h->saddr, dev, 0)) saddr = &skb->nh.ipv6h->saddr; @@ -774,8 +779,8 @@ struct sock *sk = ndisc_socket->sk; int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); struct sk_buff *buff; - struct inet6_ifaddr *ifp; struct icmp6hdr *icmph; + struct in6_addr saddr_buf; struct in6_addr *addrp; struct device *dev; struct rt6_info *rt; @@ -817,12 +822,10 @@ rd_len &= ~0x7; len += rd_len; - ifp = ipv6_get_lladdr(dev); - - if (ifp == NULL) { - ND_PRINTK1("redirect: no link_local addr for dev\n"); - return; - } + if (ipv6_get_lladdr(dev, &saddr_buf)) { + ND_PRINTK1("redirect: no link_local addr for dev\n"); + return; + } buff = sock_alloc_send_skb(sk, MAX_HEADER + len + dev->hard_header_len + 15, 0, 0, &err); @@ -838,7 +841,7 @@ return; } - ip6_nd_hdr(sk, buff, dev, &ifp->addr, &skb->nh.ipv6h->saddr, + ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr, IPPROTO_ICMPV6, len); icmph = (struct icmp6hdr *) skb_put(buff, len); @@ -875,7 +878,7 @@ memcpy(opt, 
skb->nh.ipv6h, rd_len - 8); - icmph->icmp6_cksum = csum_ipv6_magic(&ifp->addr, &skb->nh.ipv6h->saddr, + icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &skb->nh.ipv6h->saddr, len, IPPROTO_ICMPV6, csum_partial((u8 *) icmph, len, 0)); @@ -1034,7 +1037,7 @@ ifp->idev->dev->name); return 0; } - neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 0); + neigh = neigh_lookup(&nd_tbl, &msg->target, skb->dev); if (neigh) { if (neigh->flags & NTF_ROUTER) { @@ -1083,11 +1086,10 @@ unsigned long now = jiffies; int i; - neigh_table_lock(&nd_tbl); - for (i = 0; i <= NEIGH_HASHMASK; i++) { struct neighbour *neigh; + read_lock_bh(&tbl->lock); for (neigh = nd_tbl.hash_buckets[i]; neigh; neigh = neigh->next) { int j; @@ -1097,6 +1099,7 @@ size += 2; } + read_lock(&neigh->lock); size += sprintf(buffer+len+size, " %02x %02x %02x %02x %08lx %08lx %08x %04x %04x %04x %8s ", i, 128, @@ -1118,19 +1121,21 @@ } else { size += sprintf(buffer+len+size, "000000000000"); } + read_unlock(&neigh->lock); size += sprintf(buffer+len+size, "\n"); len += size; pos += size; if (pos <= offset) len=0; - if (pos >= offset+length) + if (pos >= offset+length) { + read_unlock_bh(&tbl->lock); goto done; } + read_unlock_bh(&tbl->lock); } done: - neigh_table_unlock(&nd_tbl); *start = buffer+len-(pos-offset); /* Start of wanted data */ len = pos-offset; /* Start slop */ diff -ur ../vger3-990605/linux/net/ipv6/raw.c linux/net/ipv6/raw.c --- ../vger3-990605/linux/net/ipv6/raw.c Fri May 28 19:54:34 1999 +++ linux/net/ipv6/raw.c Sat Jun 5 21:03:20 1999 @@ -99,17 +99,6 @@ SOCKHASH_UNLOCK_WRITE(); } -static __inline__ int inet6_mc_check(struct sock *sk, struct in6_addr *addr) -{ - struct ipv6_mc_socklist *mc; - - for (mc = sk->net_pinfo.af_inet6.ipv6_mc_list; mc; mc=mc->next) { - if (ipv6_addr_cmp(&mc->addr, addr) == 0) - return 1; - } - - return 0; -} /* Grumble... icmp and ip_input want to get at this... 
*/ struct sock *raw_v6_lookup(struct sock *sk, unsigned short num, diff -ur ../vger3-990605/linux/net/ipv6/route.c linux/net/ipv6/route.c --- ../vger3-990605/linux/net/ipv6/route.c Sun Mar 21 17:33:57 1999 +++ linux/net/ipv6/route.c Sun Jun 6 18:49:26 1999 @@ -1607,7 +1607,7 @@ return 0; } -static int fib6_dump_done(struct netlink_callback *cb) +static void fib6_dump_end(struct netlink_callback *cb) { struct fib6_walker_t *w = (void*)cb->args[0]; @@ -1622,6 +1622,11 @@ cb->done = (void*)cb->args[1]; cb->args[1] = 0; } +} + +static int fib6_dump_done(struct netlink_callback *cb) +{ + fib6_dump_end(cb); return cb->done(cb); } @@ -1668,11 +1673,15 @@ if (res <= 0 && skb->len == 0) RT6_TRACE("%p>dump end\n", w); #endif + res = res < 0 ? res : skb->len; /* res < 0 is an error. (really, impossible) res == 0 means that dump is complete, but skb still can contain data. res > 0 dump is not complete, but frame is full. */ - return res < 0 ? res : skb->len; + /* Destroy walker, if dump of this table is complete. 
*/ + if (res <= 0) + fib6_dump_end(cb); + return res; } int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) diff -ur ../vger3-990605/linux/net/ipv6/udp.c linux/net/ipv6/udp.c --- ../vger3-990605/linux/net/ipv6/udp.c Fri May 28 19:54:46 1999 +++ linux/net/ipv6/udp.c Sun Jun 6 21:34:06 1999 @@ -499,18 +504,6 @@ } ipv6_statistics.Ip6InDelivers++; udp_stats_in6.UdpInDatagrams++; - return 0; -} - -static __inline__ int inet6_mc_check(struct sock *sk, struct in6_addr *addr) -{ - struct ipv6_mc_socklist *mc; - - for (mc = sk->net_pinfo.af_inet6.ipv6_mc_list; mc; mc=mc->next) { - if (ipv6_addr_cmp(&mc->addr, addr) == 0) - return 1; - } - return 0; } diff -ur ../vger3-990605/linux/net/netrom/nr_route.c linux/net/netrom/nr_route.c --- ../vger3-990605/linux/net/netrom/nr_route.c Fri May 28 19:55:04 1999 +++ linux/net/netrom/nr_route.c Sat Jun 5 20:35:27 1999 @@ -564,13 +564,13 @@ { struct device *dev, *first = NULL; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return first; } @@ -582,13 +582,13 @@ { struct device *dev; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) goto out; } out: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return dev; } diff -ur ../vger3-990605/linux/net/packet/af_packet.c linux/net/packet/af_packet.c --- ../vger3-990605/linux/net/packet/af_packet.c Sun Mar 21 17:34:21 1999 +++ linux/net/packet/af_packet.c Sat Jun 5 19:52:13 1999 @@ -286,26 +286,27 @@ else return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */ + dev_lock_list(); + /* * Find the device first to 
size check it */ saddr->spkt_device[13] = 0; dev = dev_get(saddr->spkt_device); - if (dev == NULL) - { - return(-ENODEV); - } + err = -ENODEV; + if (dev == NULL) + goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. */ - if(len>dev->mtu+dev->hard_header_len) - return -EMSGSIZE; + err = -EMSGSIZE; + if(len>dev->mtu+dev->hard_header_len) + goto out_unlock; - dev_lock_list(); err = -ENOBUFS; skb = sock_wmalloc(sk, len+dev->hard_header_len+15, 0, GFP_KERNEL); @@ -351,8 +352,8 @@ * Now send it */ - dev_unlock_list(); dev_queue_xmit(skb); + dev_unlock_list(); return(len); out_free: @@ -455,16 +456,18 @@ addr = saddr->sll_addr; } + dev_lock_list(); dev = dev_get_by_index(ifindex); + err = -ENXIO; if (dev == NULL) - return -ENXIO; + goto out_unlock; if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; + err = -EMSGSIZE; if (len > dev->mtu+reserve) - return -EMSGSIZE; + goto out_unlock; - dev_lock_list(); skb = sock_alloc_send_skb(sk, len+dev->hard_header_len+15, 0, msg->msg_flags & MSG_DONTWAIT, &err); @@ -501,8 +504,8 @@ * Now send it */ - dev_unlock_list(); dev_queue_xmit(skb); + dev_unlock_list(); return(len); out_free: diff -ur ../vger3-990605/linux/net/rose/rose_route.c linux/net/rose/rose_route.c --- ../vger3-990605/linux/net/rose/rose_route.c Fri May 28 19:55:07 1999 +++ linux/net/rose/rose_route.c Sat Jun 5 20:35:10 1999 @@ -543,13 +543,13 @@ { struct device *dev, *first = NULL; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return first; } @@ -561,13 +561,13 @@ { struct device *dev; - read_lock_bh(&dev_base_lock); + read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = 
dev->next) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) goto out; } out: - read_unlock_bh(&dev_base_lock); + read_unlock(&dev_base_lock); return dev; } diff -ur ../vger3-990605/linux/net/sched/cls_api.c linux/net/sched/cls_api.c --- ../vger3-990605/linux/net/sched/cls_api.c Mon Apr 5 19:35:59 1999 +++ linux/net/sched/cls_api.c Sun Jun 6 22:52:49 1999 @@ -39,20 +39,24 @@ static struct tcf_proto_ops *tcf_proto_base; +/* Protects list of registered TC modules. It is pure SMP lock. */ +static rwlock_t cls_mod_lock = RW_LOCK_UNLOCKED; /* Find classifier type by string name */ struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) { - struct tcf_proto_ops *t; + struct tcf_proto_ops *t = NULL; if (kind) { + read_lock(&cls_mod_lock); for (t = tcf_proto_base; t; t = t->next) { if (rtattr_strcmp(kind, t->kind) == 0) - return t; + break; } + read_unlock(&cls_mod_lock); } - return NULL; + return t; } /* Register(unregister) new classifier type */ @@ -61,12 +65,17 @@ { struct tcf_proto_ops *t, **tp; - for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) - if (strcmp(ops->kind, t->kind) == 0) + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) { + if (strcmp(ops->kind, t->kind) == 0) { + write_unlock(&cls_mod_lock); return -EEXIST; + } + } ops->next = NULL; *tp = ops; + write_unlock(&cls_mod_lock); return 0; } @@ -74,13 +83,17 @@ { struct tcf_proto_ops *t, **tp; + write_lock(&cls_mod_lock); for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) if (t == ops) break; - if (!t) + if (!t) { + write_unlock(&cls_mod_lock); return -ENOENT; + } *tp = t->next; + write_unlock(&cls_mod_lock); return 0; } @@ -217,8 +230,12 @@ kfree(tp); goto errout; } + write_lock(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); tp->next = *back; *back = tp; + spin_unlock_bh(&dev->queue_lock); + write_unlock(&qdisc_tree_lock); } else if (tca[TCA_KIND-1] && 
rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) goto errout; @@ -226,8 +243,11 @@ if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { + write_lock(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); *back = tp->next; - synchronize_bh(); + spin_unlock_bh(&dev->queue_lock); + write_unlock(&qdisc_tree_lock); tp->ops->destroy(tp); kfree(tp); @@ -344,12 +364,16 @@ return skb->len; if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) return skb->len; + + read_lock(&qdisc_tree_lock); if (!tcm->tcm_parent) q = dev->qdisc_sleeping; else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); - if (q == NULL) + if (q == NULL) { + read_unlock(&qdisc_tree_lock); return skb->len; + } if ((cops = q->ops->cl_ops) == NULL) goto errout; if (TC_H_MIN(tcm->tcm_parent)) { @@ -400,6 +424,7 @@ if (cl) cops->put(q, cl); + read_unlock(&qdisc_tree_lock); return skb->len; } diff -ur ../vger3-990605/linux/net/sched/cls_fw.c linux/net/sched/cls_fw.c --- ../vger3-990605/linux/net/sched/cls_fw.c Sat May 1 16:38:30 1999 +++ linux/net/sched/cls_fw.c Sun Jun 6 22:52:49 1999 @@ -136,7 +136,7 @@ unsigned long cl; head->ht[h] = f->next; - if ((cl = cls_set_class(&f->res.class, 0)) != 0) + if ((cl = __cls_set_class(&f->res.class, 0)) != 0) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); #ifdef CONFIG_NET_CLS_POLICE tcf_police_release(f->police); @@ -161,10 +161,11 @@ if (*fp == f) { unsigned long cl; + tcf_tree_lock(tp); *fp = f->next; - synchronize_bh(); + tcf_tree_unlock(tp); - if ((cl = cls_set_class(&f->res.class, 0)) != 0) + if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); #ifdef CONFIG_NET_CLS_POLICE tcf_police_release(f->police); @@ -203,7 +204,7 @@ f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid); - cl = cls_set_class(&f->res.class, cl); + cl = cls_set_class(tp, &f->res.class, cl); if (cl) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); } @@ -211,8 +212,9 @@ if 
(tb[TCA_FW_POLICE-1]) { struct tcf_police *police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]); + tcf_tree_lock(tp); police = xchg(&f->police, police); - synchronize_bh(); + tcf_tree_unlock(tp); tcf_police_release(police); } @@ -229,8 +231,9 @@ return -ENOBUFS; memset(head, 0, sizeof(*head)); + tcf_tree_lock(tp); tp->root = head; - synchronize_bh(); + tcf_tree_unlock(tp); } f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL); @@ -245,7 +248,7 @@ if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != 4) goto errout; f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); - cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); + cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); } #ifdef CONFIG_NET_CLS_POLICE @@ -254,8 +257,9 @@ #endif f->next = head->ht[fw_hash(handle)]; - wmb(); + tcf_tree_lock(tp); head->ht[fw_hash(handle)] = f; + tcf_tree_unlock(tp); *arg = (unsigned long)f; return 0; @@ -335,7 +339,8 @@ rta->rta_len = skb->tail - b; #ifdef CONFIG_NET_CLS_POLICE if (f->police) { - RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats); + if (qdisc_copy_stats(skb, &f->police->stats)) + goto rtattr_failure; } #endif return skb->len; diff -ur ../vger3-990605/linux/net/sched/cls_route.c linux/net/sched/cls_route.c --- ../vger3-990605/linux/net/sched/cls_route.c Mon Apr 5 19:36:00 1999 +++ linux/net/sched/cls_route.c Sun Jun 6 22:52:49 1999 @@ -83,11 +83,11 @@ return id&0xF; } -static void route4_reset_fastmap(struct route4_head *head, u32 id) +static void route4_reset_fastmap(struct device *dev, struct route4_head *head, u32 id) { - start_bh_atomic(); + spin_lock_bh(&dev->queue_lock); memset(head->fastmap, 0, sizeof(head->fastmap)); - end_bh_atomic(); + spin_unlock_bh(&dev->queue_lock); } static void __inline__ @@ -297,7 +297,7 @@ unsigned long cl; b->ht[h2] = f->next; - if ((cl = cls_set_class(&f->res.class, 0)) != 0) + if ((cl = __cls_set_class(&f->res.class, 0)) != 0) 
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); #ifdef CONFIG_NET_CLS_POLICE tcf_police_release(f->police); @@ -329,12 +329,13 @@ if (*fp == f) { unsigned long cl; + tcf_tree_lock(tp); *fp = f->next; - synchronize_bh(); + tcf_tree_unlock(tp); - route4_reset_fastmap(head, f->id); + route4_reset_fastmap(tp->q->dev, head, f->id); - if ((cl = cls_set_class(&f->res.class, 0)) != 0) + if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); #ifdef CONFIG_NET_CLS_POLICE @@ -349,8 +350,9 @@ return 0; /* OK, session has no flows */ + tcf_tree_lock(tp); head->table[to_hash(h)] = NULL; - synchronize_bh(); + tcf_tree_unlock(tp); kfree(b); return 0; @@ -387,7 +389,7 @@ unsigned long cl; f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); - cl = cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); + cl = cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); if (cl) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); } @@ -395,8 +397,9 @@ if (tb[TCA_ROUTE4_POLICE-1]) { struct tcf_police *police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]); + tcf_tree_lock(tp); police = xchg(&f->police, police); - synchronize_bh(); + tcf_tree_unlock(tp); tcf_police_release(police); } @@ -412,8 +415,9 @@ return -ENOBUFS; memset(head, 0, sizeof(struct route4_head)); + tcf_tree_lock(tp); tp->root = head; - synchronize_bh(); + tcf_tree_unlock(tp); } f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL); @@ -475,8 +479,9 @@ goto errout; memset(b, 0, sizeof(*b)); + tcf_tree_lock(tp); head->table[h1] = b; - synchronize_bh(); + tcf_tree_unlock(tp); } f->bkt = b; @@ -489,17 +494,18 @@ goto errout; } - cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); + cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); #ifdef CONFIG_NET_CLS_POLICE if (tb[TCA_ROUTE4_POLICE-1]) f->police = 
tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]); #endif f->next = f1; - wmb(); + tcf_tree_lock(tp); *ins_f = f; + tcf_tree_unlock(tp); - route4_reset_fastmap(head, f->id); + route4_reset_fastmap(tp->q->dev, head, f->id); *arg = (unsigned long)f; return 0; @@ -589,7 +595,8 @@ rta->rta_len = skb->tail - b; #ifdef CONFIG_NET_CLS_POLICE if (f->police) { - RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats); + if (qdisc_copy_stats(skb, &f->police->stats)) + goto rtattr_failure; } #endif return skb->len; diff -ur ../vger3-990605/linux/net/sched/cls_rsvp.h linux/net/sched/cls_rsvp.h --- ../vger3-990605/linux/net/sched/cls_rsvp.h Sat May 1 16:38:30 1999 +++ linux/net/sched/cls_rsvp.h Sun Jun 6 23:14:37 1999 @@ -282,7 +282,7 @@ unsigned long cl; s->ht[h2] = f->next; - if ((cl = cls_set_class(&f->res.class, 0)) != 0) + if ((cl = __cls_set_class(&f->res.class, 0)) != 0) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); #ifdef CONFIG_NET_CLS_POLICE tcf_police_release(f->police); @@ -310,10 +310,11 @@ unsigned long cl; + tcf_tree_lock(tp); *fp = f->next; - synchronize_bh(); + tcf_tree_unlock(tp); - if ((cl = cls_set_class(&f->res.class, 0)) != 0) + if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); #ifdef CONFIG_NET_CLS_POLICE @@ -332,8 +333,9 @@ for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; *sp; sp = &(*sp)->next) { if (*sp == s) { + tcf_tree_lock(tp); *sp = s->next; - synchronize_bh(); + tcf_tree_unlock(tp); kfree(s); return 0; @@ -446,7 +448,7 @@ unsigned long cl; f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); - cl = cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); + cl = cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); if (cl) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); } @@ -454,8 +456,9 @@ if (tb[TCA_RSVP_POLICE-1]) { struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]); + 
tcf_tree_lock(tp); police = xchg(&f->police, police); - synchronize_bh(); + tcf_tree_unlock(tp); tcf_police_release(police); } @@ -536,7 +539,7 @@ f->sess = s; if (f->tunnelhdr == 0) - cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); + cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); #ifdef CONFIG_NET_CLS_POLICE if (tb[TCA_RSVP_POLICE-1]) f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]); @@ -659,7 +662,8 @@ rta->rta_len = skb->tail - b; #ifdef CONFIG_NET_CLS_POLICE if (f->police) { - RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats); + if (qdisc_copy_stats(skb, &f->police->stats)) + goto rtattr_failure; } #endif return skb->len; diff -ur ../vger3-990605/linux/net/sched/cls_u32.c linux/net/sched/cls_u32.c --- ../vger3-990605/linux/net/sched/cls_u32.c Mon Apr 5 19:36:00 1999 +++ linux/net/sched/cls_u32.c Sun Jun 6 23:16:24 1999 @@ -307,7 +307,7 @@ { unsigned long cl; - if ((cl = cls_set_class(&n->res.class, 0)) != 0) + if ((cl = __cls_set_class(&n->res.class, 0)) != 0) tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); #ifdef CONFIG_NET_CLS_POLICE tcf_police_release(n->police); @@ -326,8 +326,9 @@ if (ht) { for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { if (*kp == key) { + tcf_tree_lock(tp); *kp = key->next; - synchronize_bh(); + tcf_tree_unlock(tp); u32_destroy_key(tp, key); return 0; @@ -346,7 +347,6 @@ for (h=0; h<=ht->divisor; h++) { while ((n = ht->ht[h]) != NULL) { ht->ht[h] = n->next; - synchronize_bh(); u32_destroy_key(tp, n); } @@ -465,8 +465,9 @@ ht_down->refcnt++; } + sch_tree_lock(q); ht_down = xchg(&n->ht_down, ht_down); - synchronize_bh(); + sch_tree_unlock(q); if (ht_down) ht_down->refcnt--; @@ -475,7 +476,9 @@ unsigned long cl; n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); - cl = cls_set_class(&n->res.class, q->ops->cl_ops->bind_tcf(q, base, n->res.classid)); + sch_tree_lock(q); + cl = 
__cls_set_class(&n->res.class, q->ops->cl_ops->bind_tcf(q, base, n->res.classid)); + sch_tree_unlock(q); if (cl) q->ops->cl_ops->unbind_tcf(q, cl); } @@ -483,8 +486,9 @@ if (tb[TCA_U32_POLICE-1]) { struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1], est); + sch_tree_lock(q); police = xchg(&n->police, police); - synchronize_bh(); + sch_tree_unlock(q); tcf_police_release(police); } @@ -682,7 +686,8 @@ rta->rta_len = skb->tail - b; #ifdef CONFIG_NET_CLS_POLICE if (TC_U32_KEY(n->handle) && n->police) { - RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &n->police->stats); + if (qdisc_copy_stats(skb, &n->police->stats)) + goto rtattr_failure; } #endif return skb->len; diff -ur ../vger3-990605/linux/net/sched/estimator.c linux/net/sched/estimator.c --- ../vger3-990605/linux/net/sched/estimator.c Mon Apr 5 19:36:00 1999 +++ linux/net/sched/estimator.c Sun Jun 6 20:29:12 1999 @@ -97,29 +97,38 @@ static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; +/* Estimator array lock */ +static rwlock_t est_lock = RW_LOCK_UNLOCKED; + static void est_timer(unsigned long arg) { int idx = (int)arg; struct qdisc_estimator *e; + read_lock(&est_lock); for (e = elist[idx].list; e; e = e->next) { - u64 nbytes = e->stats->bytes; - u32 npackets = e->stats->packets; + struct tc_stats *st = e->stats; + u64 nbytes; + u32 npackets; u32 rate; - + + spin_lock(st->lock); + nbytes = st->bytes; + npackets = st->packets; rate = (nbytes - e->last_bytes)<<(7 - idx); e->last_bytes = nbytes; e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; - e->stats->bps = (e->avbps+0xF)>>5; + st->bps = (e->avbps+0xF)>>5; rate = (npackets - e->last_packets)<<(12 - idx); e->last_packets = npackets; e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; e->stats->pps = (e->avpps+0x1FF)>>10; + spin_unlock(st->lock); } - elist[idx].timer.expires = jiffies + ((HZ/4)<interval].timer.function = est_timer; add_timer(&elist[est->interval].timer); } + write_lock_bh(&est_lock); 
elist[est->interval].list = est; + write_unlock_bh(&est_lock); return 0; } @@ -172,8 +183,9 @@ continue; } + write_lock_bh(&est_lock); *pest = est->next; - synchronize_bh(); + write_unlock_bh(&est_lock); kfree(est); killed++; diff -ur ../vger3-990605/linux/net/sched/police.c linux/net/sched/police.c --- ../vger3-990605/linux/net/sched/police.c Sun Mar 21 17:39:27 1999 +++ linux/net/sched/police.c Sun Jun 6 20:29:14 1999 @@ -38,6 +38,10 @@ static u32 idx_gen; static struct tcf_police *tcf_police_ht[16]; +/* Policer hash table lock */ +static rwlock_t police_lock = RW_LOCK_UNLOCKED; + +/* Each policer is serialized by its individual spinlock */ static __inline__ unsigned tcf_police_hash(u32 index) { @@ -48,11 +52,13 @@ { struct tcf_police *p; + read_lock(&police_lock); for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { if (p->index == index) - return p; + break; } - return NULL; + read_unlock(&police_lock); + return p; } static __inline__ u32 tcf_police_new_index(void) @@ -73,7 +79,9 @@ for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { if (*p1p == p) { + write_lock_bh(&police_lock); *p1p = p->next; + write_unlock_bh(&police_lock); #ifdef CONFIG_NET_ESTIMATOR qdisc_kill_estimator(&p->stats); #endif @@ -114,6 +122,8 @@ memset(p, 0, sizeof(*p)); p->refcnt = 1; + spin_lock_init(&p->lock); + p->stats.lock = &p->lock; if (parm->rate.rate) { if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL) goto failure; @@ -144,8 +154,10 @@ qdisc_new_estimator(&p->stats, est); #endif h = tcf_police_hash(p->index); + write_lock_bh(&police_lock); p->next = tcf_police_ht[h]; tcf_police_ht[h] = p; + write_unlock_bh(&police_lock); return p; failure: @@ -161,19 +173,24 @@ long toks; long ptoks = 0; + spin_lock(&p->lock); + p->stats.bytes += skb->len; p->stats.packets++; #ifdef CONFIG_NET_ESTIMATOR if (p->ewma_rate && p->stats.bps >= p->ewma_rate) { p->stats.overlimits++; + spin_unlock(&p->lock); return p->action; } #endif if (skb->len <= p->mtu) 
{ - if (p->R_tab == NULL) + if (p->R_tab == NULL) { + spin_unlock(&p->lock); return p->result; + } PSCHED_GET_TIME(now); @@ -194,11 +211,13 @@ p->t_c = now; p->toks = toks; p->ptoks = ptoks; + spin_unlock(&p->lock); return p->result; } } p->stats.overlimits++; + spin_unlock(&p->lock); return p->action; } diff -ur ../vger3-990605/linux/net/sched/sch_api.c linux/net/sched/sch_api.c --- ../vger3-990605/linux/net/sched/sch_api.c Fri May 28 19:55:11 1999 +++ linux/net/sched/sch_api.c Sun Jun 6 23:10:49 1999 @@ -124,6 +124,28 @@ changes qdisc parameters. */ +/* Main qdisc structure lock. + + However, modifications + to data, participating in scheduling must be additionally + protected with dev->queue_lock spinlock. + + The idea is the following: + - enqueue, dequeue are serialized via top level device + spinlock dev->queue_lock. + - tree walking is protected by read_lock(qdisc_tree_lock) + and this lock is used only in process context. + - updates to tree are made only under rtnl semaphore, + hence this lock may be made without local bh disabling. + + qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! + */ +rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED; + +/* Protects list of registered TC modules. It is pure SMP lock. */ +static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED; + + /************************************************ * Queueing disciplines manipulation. 
* ************************************************/ @@ -139,9 +161,13 @@ { struct Qdisc_ops *q, **qp; - for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) - if (strcmp(qops->id, q->id) == 0) + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) { + if (strcmp(qops->id, q->id) == 0) { + write_unlock(&qdisc_mod_lock); return -EEXIST; + } + } if (qops->enqueue == NULL) qops->enqueue = noop_qdisc_ops.enqueue; @@ -152,20 +178,26 @@ qops->next = NULL; *qp = qops; + write_unlock(&qdisc_mod_lock); return 0; } int unregister_qdisc(struct Qdisc_ops *qops) { struct Qdisc_ops *q, **qp; + int err = -ENOENT; + + write_lock(&qdisc_mod_lock); for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) if (q ==