#!/usr/bin/env python
# encoding: utf-8

# Measurement sites, one row per site: [site name, latitude, longitude, altitude].
# The rows are unpacked as (site, lat, lon, alt) throughout this file, and
# neighbouring rows (including last -> first) are treated as station pairs.
# NOTE(review): coordinates look like decimal degrees and altitude like metres — confirm.
table=[["Mahlsdorf", 52.486, 13.589, 39.0], ["Lichtenrade", 52.391, 13.392, 44.8], ["Charlottenburg", 52.505, 13.302, 47.7], ["Heiligensee", 52.622, 13.228, 34.5], ["Lindenberg", 52.601, 13.519, 63.3]]
import os
import sys
import math
import datetime
import bisect
import csv
import functools
import traceback
import pylab as pl
import numpy as np
import mpl_toolkits.basemap as bm
import matplotlib.dates as mdates

#: factor that turns an angle given in degrees into radians
deg2rad = math.pi / 180.
#: factor that turns an angle given in radians into degrees
rad2deg = 180. / math.pi

def read_sonde_data(filename, step):
    """Parse one sounding out of a sonde text file.

    Lines are skipped until ``step`` lines starting with ``$`` have been
    seen (with ``step=0`` parsing starts at the very first line).  From
    then on every line is split on whitespace and converted to floats:

    * eleven values: a complete row; pressure (1st), height (2nd),
      direction (7th) and speed (8th column) are kept,
    * three values: a row with missing values, read as pressure,
      direction, speed; the height is stored as ``None``,
    * anything else (blank line, text, other column count) ends the
      sounding.

    :param filename: path of the sonde file.
    :param step: number of ``$`` marker lines to skip before reading.
    :returns: {"PRES_hPa": [pres0, pres1, ...], "HGHT_m": ..., "DRCT_deg": ..., "SKNT_knot": ...}
    """
    columns = {"PRES_hPa": [], "HGHT_m": [], "DRCT_deg": [], "SKNT_knot": []}
    markers_seen = 0
    with open(filename) as sonde_file:
        for raw in sonde_file:
            # still looking for the requested block: count markers, skip the rest
            if markers_seen < step:
                if raw.startswith("$"):
                    markers_seen += 1
                continue
            try:
                numbers = [float(field) for field in raw.split()]
            except ValueError:  # non-numeric content: end of the block
                break
            if len(numbers) == 11:
                pressure, height = numbers[0], numbers[1]
                direction, speed = numbers[6], numbers[7]
            elif len(numbers) == 3:
                # line with missing values: top of measurement
                pressure, direction, speed = numbers
                height = None
            else:  # empty line or unexpected layout
                break
            columns["PRES_hPa"].append(pressure)
            columns["DRCT_deg"].append(direction)
            columns["SKNT_knot"].append(speed)
            columns["HGHT_m"].append(height)
    return columns


def average_directions_partial(DRCT_deg, PRES_hPa, SKNT_knot, indexes, aggfun=np.mean):

    """pressure- and wind-speed weighted average of the DRCT_deg (mass flow average).

    Only the levels listed in ``indexes`` take part.  Each selected level
    is weighted with its wind speed times the pressure difference it
    accounts for: the difference to the next selected level for the
    first one, to the previous selected level for the last one, and the
    mean of both for levels in between.  A selection of exactly one
    level gets unit pressure weight, so the result is that level's own
    direction.

    This uses atan2(agg(...sin(...)), agg(...cos(...))) to avoid problems at domain boundaries.

    See https://en.wikipedia.org/wiki/Mean_of_circular_quantities

    :param DRCT_deg: directions in degrees, one per level.
    :param PRES_hPa: pressures, one per level.
    :param SKNT_knot: wind speeds, one per level.
    :param indexes: positions of the levels to use (list, range or array).
    :param aggfun: aggregation applied to the weighted sines and cosines.
    :returns: averaged direction in degrees, reduced modulo 360.

    >>> average_directions_partial(DRCT_deg=[1, 181, 181, 1, 1, 181, 181], PRES_hPa=[100, 90, 80, 70, 1, 0.9, 0.8], SKNT_knot=[1]*7, indexes=[1, 2, 3, 4, 5, 6])
    0.999999999999997
    >>> average_directions_partial(DRCT_deg=[1, 181, 181, 1, 1, 181, 181], PRES_hPa=[100, 90, 80, 70, 1, 0.9, 0.8], SKNT_knot=[1] + [0]*6, indexes=[1, 2, 3, 4, 5, 6])
    0.0
    >>> average_directions_partial(DRCT_deg=[1, 181, 181, 1, 1, 181, 181], PRES_hPa=[100, 90, 80, 70, 1, 0.9, 0.8], SKNT_knot=[0, 1] + [0]*5, indexes=[1, 2, 3, 4, 5, 6])
    181.0
    >>> p = [1001.0, 1000.0, 966.0, 961.0, 925.0, 850.0, 827.0, 789.0, 776.0, 773.0, 763.0, 754.0, 715.0, 700.0, 690.0, 687.0, 672.0, 671.0, 661.0, 654.0, 638.0, 626.0, 618.0, 603.0, 576.0, 570.0, 538.0, 517.0, 507.0, 500.0, 495.0, 487.0, 458.0, 440.0, 428.0, 420.0, 400.0, 327.0, 300.0, 290.0, 250.0, 223.0, 216.0, 206.0, 204.0, 200.0, 195.0, 189.0, 185.0, 182.0, 171.0, 164.0, 159.0, 150.0, 143.0, 135.0, 131.0, 100.0, 95.0, 89.0, 85.0, 82.0, 77.0, 74.0, 72.0, 71.2, 70.0, 67.0, 66.3, 64.0, 60.0, 59.0, 58.0, 53.0, 50.4, 50.0, 45.0, 44.8, 44.0, 43.0, 41.0, 40.0, 39.0, 36.3, 36.0, 32.0, 31.0, 30.0, 29.0, 27.0, 26.0, 25.3, 23.0, 22.0, 21.0, 20.0, 19.0, 18.0, 17.0, 15.0, 14.8, 14.0, 13.0, 12.2, 12.0, 11.0, 10.7, 10.0, 9.0, 8.1, 8.0]
    >>> d = [190.0, 185.0, 175.0, 165.0, 175.0, 180.0, 197.0, 225.0, 245.0, 242.0, 230.0, 245.0, 245.0, 245.0, 249.0, 250.0, 236.0, 235.0, 265.0, 267.0, 270.0, 280.0, 275.0, 265.0, 275.0, 276.0, 281.0, 285.0, 288.0, 290.0, 289.0, 287.0, 282.0, 278.0, 275.0, 276.0, 280.0, 285.0, 285.0, 285.0, 285.0, 290.0, 290.0, 286.0, 285.0, 275.0, 275.0, 285.0, 275.0, 270.0, 280.0, 270.0, 275.0, 280.0, 280.0, 280.0, 260.0, 265.0, 245.0, 255.0, 245.0, 265.0, 260.0, 280.0, 245.0, 245.0, 245.0, 250.0, 258.0, 285.0, 215.0, 210.0, 215.0, 0.0, 235.0, 215.0, 50.0, 55.0, 75.0, 105.0, 75.0, 50.0, 95.0, 122.0, 125.0, 135.0, 105.0, 80.0, 95.0, 125.0, 145.0, 134.0, 95.0, 80.0, 60.0, 80.0, 75.0, 80.0, 85.0, 95.0, 97.0, 105.0, 95.0, 83.0, 80.0, 85.0, 89.0, 100.0, 100.0, 78.0, 75.0]
    >>> k = [1]*len(d)
    >>> average_directions_partial(d, p, k, range(1, 10))
    186.95201707921873
    >>> k = range(len(d))
    >>> average_directions_partial(d, p, k, range(1, 10))
    193.5216602062587
    >>> round(average_directions_partial([10, 200], [1000, 900], [5, 5], [1]), 6)
    200.0
    """
    PRES_hPa = np.array(PRES_hPa)
    DRCT_deg = np.array(DRCT_deg)
    # local conversion factors keep the function usable on its own
    deg2rad = 2 * math.pi / 360.
    rad2deg = 360. / (2 * math.pi)
    count = len(indexes)
    maxidx = count - 1
    sinvals = []
    cosvals = []
    for j in range(count):
        here = indexes[j]
        if count == 1:
            # a lone level has no neighbour to take a pressure difference
            # with; any positive weight yields that level's direction
            dpres = 1.0
        elif j == 0:
            dpres = PRES_hPa[here] - PRES_hPa[indexes[j+1]]
        elif j == maxidx:
            dpres = PRES_hPa[indexes[j-1]] - PRES_hPa[here]
        else:
            dpres = (0.5 * ((PRES_hPa[indexes[j-1]] - PRES_hPa[here]) +
                            (PRES_hPa[here] - PRES_hPa[indexes[j+1]])))
        weight = dpres * SKNT_knot[here]
        angle = DRCT_deg[here] * deg2rad
        sinvals.append(weight * math.sin(angle))
        cosvals.append(weight * math.cos(angle))
    return (rad2deg * math.atan2(aggfun(sinvals), aggfun(cosvals))) % 360


def average_and_resample_directions(PRES_hPa, HGHT_m, DRCT_deg, SKNT_knot):
    """Average the wind direction and estimate its spread by resampling.

    The average over all levels is complemented by 1000 averages over
    random subsets of the levels (``max(4, number of levels // 10)``
    draws with replacement, duplicates removed).  The spread is the
    smallest standard deviation of those averages among four versions
    shifted by 0, 90, 180 and 270 degrees, which works around the wrap
    at 0/360 degrees.  ``HGHT_m`` is accepted but not used.

    :returns: (drct, drctstd, reduced_indexes, realizations) where the
        last two are the index subsets and the averages computed from them.
    """
    n_levels = len(DRCT_deg)
    draws = max(4, n_levels//10)
    # average over the complete profile
    drct = average_directions_partial(DRCT_deg, PRES_hPa, SKNT_knot, range(n_levels))
    # averages over thinned-out profiles; the subsets are kept for later plotting
    reduced_indexes = []
    realizations = []
    for _ in range(1000):
        subset = np.unique(np.random.choice(range(n_levels), draws))
        realizations.append(average_directions_partial(DRCT_deg, PRES_hPa, SKNT_knot, subset))
        reduced_indexes.append(subset)
    # spread of the unshifted values and of the three rotated copies
    spreads = [pl.std(realizations)]
    for shift in (90, 180, 270):
        spreads.append(pl.std([(value+shift)%360 for value in realizations]))
    drctstd = min(spreads)
    return drct, drctstd, reduced_indexes, realizations


def plot_averaging_statistics(drct, drctstd, DRCT_deg, PRES_hPa, reduced_indexes, reduced_averages):
    """Plot the direction profile with the resampled averages and save it as PNG.

    Two y-axes share one x-axis (direction): the first axis (``ax1``)
    shows the resampled averages against their running number, the twin
    axis shows direction against pressure for the full profile (red) and
    for every resampled subset (faint lines), plus a gray band and line
    for ``drct`` +/- ``drctstd``.

    :param drct: average direction, drawn as vertical line.
    :param drctstd: spread of the average, drawn as band around ``drct``.
    :param DRCT_deg: directions of the full profile.
    :param PRES_hPa: pressures of the full profile.
    :param reduced_indexes: index subsets used for the resampled averages.
    :param reduced_averages: the resampled averages themselves.

    NOTE(review): ``filename`` and ``step`` are not parameters of this
    function; the title and the output file name read them as
    module-level names, so they must exist as globals when this runs.
    """
    r = reduced_averages
    # start from a clean figure
    pl.close()
    ax1 = pl.gca()
    pl.xlabel("direction / $^\\circ$")
    pl.ylabel("standard deviation realizations / unitless")
    pl.ylim(-100, 1200)
    pl.gca().invert_yaxis()
    # second y-axis; from here on pl.* calls address the twin axis
    pl.gca().twinx()
    pl.ylim(-100, 1200)
    pl.fill_betweenx([max(PRES_hPa), min(PRES_hPa)], [drct-drctstd]*2, [drct+drctstd]*2, alpha=0.3, color="gray")
    handlevline = pl.axvline(drct, color="gray", label="$%04.2f \\pm %04.2f ^\\circ$" % (drct, drctstd))
    # plot the data with reduced sampling
    # (arrays are needed so the index subsets can be used for fancy indexing)
    DRCT_deg = np.array(DRCT_deg)
    PRES_hPa = np.array(PRES_hPa)
    for idxes in reduced_indexes:
        pl.plot(DRCT_deg[idxes], PRES_hPa[idxes], alpha=0.1)
    pl.plot(DRCT_deg, PRES_hPa, color="red")
    pl.ylabel("pressure / hPa")
    pl.gca().invert_yaxis()
    # band of two standard deviations around the median of the realizations;
    # drawn at negated x like the realizations below
    # NOTE(review): the negation presumably keeps these apart from the profile — confirm
    handlefillx = ax1.fill_betweenx([0, len(r)], [-pl.median(r)-2*drctstd]*2, [-pl.median(r)+2*drctstd]*2, alpha=0.3, label="median $\pm$ $2\\times$std")
    # plot the different realizations of the average direction
    ax1.plot([-i for i in r], range(len(r)), color="blue", alpha=0.8)
    # step indexes the synoptic hours 0, 6, 12, 18
    pl.title("%s - %02dZ" % (filename, [0,6,12,18][step]))
    pl.legend(handles=(handlevline, handlefillx), loc="lower left", fancybox=True, framealpha=0.5)
    # pl.xkcd()
    pl.xlim(-400, 400)
    # output name: file name up to its first dot plus the hour
    pl.savefig("wind-directions-berlin-averaging-%s-%02d.png" % (filename[:filename.index('.')], [0,6,12,18][step]), bbox_inches='tight')
    pl.close()
    

def direction_radians(lat0, lon0, lat1, lon1):
    """Angle of the vector from point 0 to point 1 in the flat lon/lat plane.

    The coordinates (degrees) are converted to radians and the angle is
    ``atan2(lat1 - lat0, lon1 - lon0)``: 0 points towards increasing
    longitude, pi/2 towards increasing latitude and -pi/2 towards
    decreasing latitude.  No spherical correction is applied.

    :returns: angle in radians in [-pi, pi]; 0.0 for identical points.
    """
    lon0, lon1, lat0, lat1 = [i * 2 * math.pi / 360.
                              for i in [lon0, lon1, lat0, lat1]]
    dlat = lat1 - lat0
    dlon = lon1 - lon0
    if dlat == 0 and dlon == 0:
        # identical points have no direction; keep returning zero
        return 0.0
    # atan2 copes with dlon == 0 itself and, unlike atan(1e18), returns
    # -pi/2 when the second point lies at lower latitude
    return math.atan2(dlat, dlon)


def direction_degree(lat0, lon0, lat1, lon1):
    """Return direction_radians() of the two points in degrees, reduced modulo 360."""
    angle_rad = direction_radians(lat0, lon0, lat1, lon1)
    rad_to_deg = 360 / (2*math.pi)
    return (rad_to_deg * angle_rad) % 360


def get_station_pairs(site_table, berlin, drct, drctstd):
    """Pick the neighbouring station pairs most parallel / orthogonal to the wind.

    Every row of ``site_table`` is paired with the following row (the
    last one with the first).  For each pair the direction from the
    first to the second site and the midpoint are computed.

    :param site_table: rows of (site, lat, lon, alt).
    :param berlin: (lat, lon) tuple of the reference point.
    :param drct: average wind direction in degrees.
    :param drctstd: spread of the wind direction in degrees; tightens the
        orthogonality criterion.
    :returns: ``(parallel, orthogonal)``; ``parallel`` is
        ``(site0, site1, direction_deg, difference)`` or None for an empty
        table, ``orthogonal`` is
        ``(site0, site1, direction_deg, difference, midpoint)`` or None
        when no pair qualifies.

    NOTE(review): the parallel pair is selected by comparing
    ``(direction - drct) % 180`` against a stored ``abs(direction - drct)``;
    the two measures differ for some angles — confirm the intended metric.
    """
    def points_upwind(origin, target):
        # direction from origin to target, relative to the wind, within +-90 degrees
        relative = (direction_degree(*(origin + target)) - drct) % 360
        return relative > 270 or relative < 90

    parallel = None
    orthogonal = None
    # use the table handed in by the caller, not the module-level one
    for n, (site0, lat0, lon0, alt0) in enumerate(site_table):
        site1, lat1, lon1, alt1 = site_table[(n+1)%len(site_table)]
        direction_deg = direction_degree(lat0, lon0, lat1, lon1)
        midpoint = (lat0+lat1)/2, (lon0+lon1)/2
        if parallel is None or ((direction_deg%180) - (drct%180))%180 < parallel[3]:
            parallel = site0, site1, direction_deg, abs((direction_deg%180) - (drct%180))
        # orthogonal: choose upwind! -> abs(direction_degree(midpoint2 - midpoint1) - drct)%360 < 90
        sufficiently_orthogonal = (
            abs(((direction_deg%180 - drct%180)%180 - 45)%180 - 45)
            < 90 - 0.9*drctstd)  # FIXME: adjusted to be more stable
        if (sufficiently_orthogonal and
                points_upwind(midpoint, berlin) and  # upwind berlin
                (orthogonal is None or
                 points_upwind(midpoint, orthogonal[4]))):  # upwind previous best
            orthogonal = site0, site1, direction_deg, abs((direction_deg%180) - ((drct + 90) % 180)), midpoint
    return parallel, orthogonal


def average_projected_wind_speed(drct, DRCT_deg, SKNT_knot, PRES_hPa):
    """Weighted average of the wind speed component along direction drct.

    Each level contributes ``speed * cos(level direction - drct)`` and is
    weighted by the pressure difference it accounts for: the difference
    to the neighbouring level at both ends of the profile and the mean
    of the differences to both neighbours in between.

    :returns: sum of the weighted projections divided by the sum of the
        weights, in the unit of ``SKNT_knot``.
    """
    pressures = np.array(PRES_hPa)
    directions = np.array(DRCT_deg)
    to_rad = 2 * math.pi / 360.
    last = len(SKNT_knot) - 1
    weighted = []
    weights = []
    for level, speed in enumerate(SKNT_knot):
        if level == 0:
            layer = (pressures[level] - pressures[level+1])
        elif level == last:
            layer = pressures[level-1] - pressures[level]
        else:
            to_previous = pressures[level-1] - pressures[level]
            to_next = pressures[level] - pressures[level+1]
            layer = 0.5 * (to_previous + to_next)
        along = math.cos(to_rad * (directions[level] - drct))
        weighted.append(layer * along * speed)
        weights.append(layer)
    return sum(weighted) / sum(weights)


def plot_wind_barbs(filename, step, 
                    llcrnrlat=52.05,urcrnrlat=52.7,
                    llcrnrlon=13.13,urcrnrlon=14.8):
    """Draw a map with the sites, the sounding's wind barbs and the best station pairs.

    Reads sounding ``step`` from ``filename``, averages and resamples the
    wind direction, writes the averaging-statistics plot via
    plot_averaging_statistics() and then saves the map as
    ``wind-directions-berlin-<name>-<hour>.png``.

    :param filename: sonde file; the part before its first dot goes into
        the output file name.
    :param step: index into the hours [0, 6, 12, 18]; also selects the
        sounding inside the file.
    :param llcrnrlat, urcrnrlat, llcrnrlon, urcrnrlon: map corners passed
        on to Basemap.
    """
    m = bm.Basemap(projection='cea', lat_ts=37.5,
                   llcrnrlat=llcrnrlat, urcrnrlat=urcrnrlat,
                   llcrnrlon=llcrnrlon, urcrnrlon=urcrnrlon,
                   epsg=2168, # the map region for arcgisimage(), see http://spatialreference.org/ref/epsg/2167/
                   # NOTE(review): the code passes 2168 while the link names 2167 — confirm
                   resolution="l")
    # pl.show()
    data = read_sonde_data(filename, step)
    PRES_hPa, HGHT_m, DRCT_deg, SKNT_knot = [data[i] for i in ["PRES_hPa", "HGHT_m", "DRCT_deg", "SKNT_knot"]]
    drct, drctstd, reduced_indexes, realizations = average_and_resample_directions(PRES_hPa, HGHT_m, DRCT_deg, SKNT_knot)
    print drct, DRCT_deg[1:10], PRES_hPa[1:10]
    # plot the standard deviation estimate
    # (that function closes the current figure before and after plotting)
    plot_averaging_statistics(drct, drctstd, DRCT_deg, PRES_hPa,
                              reduced_indexes, reduced_averages=realizations)
    # sites
    berlin = 52.5, 13.4
    for site, lat, lon, alt in table:
        x, y = m(lon, lat)
        pl.text(x, y, site, color="w", fontsize=14)
        # sites whose direction towards berlin is within 90 degrees of drct
        # get a green hexagon ("gh"), all others a red one ("rh")
        if 270 < (direction_degree(lat, lon, *berlin) - drct) % 360 or (direction_degree(lat, lon, *berlin) - drct) % 360 < 90:
            m.plot(x, y, "gh", markersize=16)
        else:
            m.plot(x, y, "rh", markersize=16)
    # add Lindenberg station
    site, lat, lon = "Lindenberg Station", 52.21, 14.11
    x, y = m(lon, lat)
    pl.text(x, y, site, color="w", fontsize=14)
    m.plot(x, y, "b^", markersize=16)
    # with wind barbs for the different heights, all anchored at the station position
    X, Y = [x]*len(PRES_hPa), [y]*len(PRES_hPa)
    # NOTE(review): U uses cos and V uses sin of DRCT_deg, i.e. the direction
    # is treated as a mathematical angle; if DRCT_deg is a compass-style wind
    # direction this needs converting — confirm
    U = np.array([SKNT_knot[i] * math.cos(DRCT_deg[i] * 2 * math.pi / 360.) for i in range(len(DRCT_deg))])
    V = np.array([SKNT_knot[i] * math.sin(DRCT_deg[i] * 2 * math.pi / 360.) for i in range(len(DRCT_deg))])
    # NOTE(review): the colour values HGHT_m[:-1] are one element shorter
    # than X, Y, U and V — confirm this is intended
    nh, sh = m.barbs(X, Y, U, V, HGHT_m[:-1], length=12, cmap=pl.cm.plasma_r, fill_empty=True, alpha=0.2, pivot='middle')
    cb = m.colorbar(mappable=nh)
    cb.set_label("Height (m)")
    x, y = m(*reversed(berlin))
    # average wind projected onto 0 and 90 degrees, and the same with the
    # projection axes rotated by plus and minus one standard deviation
    u = average_projected_wind_speed(0, DRCT_deg, SKNT_knot, PRES_hPa)
    v = average_projected_wind_speed(90, DRCT_deg, SKNT_knot, PRES_hPa)
    upstd = average_projected_wind_speed(0 + drctstd, DRCT_deg, SKNT_knot, PRES_hPa)
    vpstd = average_projected_wind_speed(90 + drctstd, DRCT_deg, SKNT_knot, PRES_hPa)
    umstd = average_projected_wind_speed(0 - drctstd, DRCT_deg, SKNT_knot, PRES_hPa)
    vmstd = average_projected_wind_speed(90 - drctstd, DRCT_deg, SKNT_knot, PRES_hPa)
    nh1, sh1 = m.barbs([x], [y], [u], [v], color="lightgray", length=12, fill_empty=True, pivot='middle', label="average wind")
    nh2, sh2 = m.barbs([x]*2, [y]*2, [umstd, upstd], [vmstd, vpstd], color="gray", length=12, fill_empty=True, pivot='middle', label="$\pm \sigma$ (resampled)")
    pl.legend(handles=(nh1, nh2), loc="lower left", fancybox=True, framealpha=0.5, numpoints=1)
    # add berlin, with the label slightly below the marker
    site, lat, lon = "Berlin", berlin[0], berlin[1]
    x, y = m(lon, lat)
    xt, yt = m(lon, lat-0.03)
    pl.text(xt, yt, site, color="w", fontsize=14)
    m.plot(x, y, "b^", markersize=16)
    # background imagery and a 0.2 degree grid
    m.arcgisimage(dpi=1200)
    m.drawmeridians(np.arange(math.floor(llcrnrlon), math.ceil(urcrnrlon), 0.2), labels=[0,0,0,1], linewidth=1.0, color=(1,1,1,0.3), zorder=1) # , yoffset=6) # labels = [left,right,top,bottom]
    m.drawparallels(np.arange(math.floor(llcrnrlat), math.ceil(urcrnrlat), 0.2), labels=[1,0,0,0], linewidth=1.0, color=(1,1,1,0.3), zorder=1)
    
    # connect neighbouring table rows (last row back to the first);
    # each connection is drawn twice, once per direction
    for n, (site0, lat0, lon0, alt0) in enumerate(table):
        site1, lat1, lon1, alt1 = table[(n+1)%len(table)]
        x0, y0 = m(lon0, lat0)
        x1, y1 = m(lon1, lat1)
        pl.plot([x0, x1], [y0, y1])
        pl.plot([x1, x0], [y1, y0])
    parallel, orthogonal = get_station_pairs(table, berlin, drct, drctstd)
    if orthogonal is None:
        # placeholder so the text formatting below still finds its fields
        orthogonal = "no pair", "68\% upwind of Berlin", 0, 0, 0, 0
    x, y = m(math.floor(urcrnrlon), math.floor(urcrnrlat*10 - 3)/10.)
    # summary text, rendered with LaTeX (usetex=True)
    pl.text(
        x, y,
        "Weighted average wind direction \(\pm \sigma\): \n\(\parallel %d \pm %d\)\n\(\perp %d \pm %d\)\nBest pairs:\n\(\parallel\) %s %s: \(\\rightarrow: %d, \\leftarrow: %d\)\n\(\perp\) %s %s: \(\\rightarrow: %d, \\leftarrow: %d\)" % (
            drct, drctstd, ((drct + 90)%360), drctstd,
            parallel[0], parallel[1], parallel[2], ((parallel[2] + 180)%360),
            orthogonal[0], orthogonal[1], orthogonal[2], ((orthogonal[2] + 180)%360)),
        color="w", fontsize=10, usetex=True)
    pl.title("%s - %02dZ" % (filename, [0,6,12,18][step]))
    pl.savefig("wind-directions-berlin-%s-%02d.png" % (filename[:filename.index('.')], [0,6,12,18][step]), bbox_inches='tight')
    pl.close()


def haversine_distance(lat0, lon0, lat1, lon1):
    """Great-circle distance between two locations via the haversine formula.

    The coordinates are taken in degrees and a sphere of radius 6371 km
    is assumed; the result is in metres.

    >>> haversine_distance(50.03, -5.42, 58.38, -3.04)
    941137.5584281242
    """
    # thanks to http://www.movable-type.co.uk/scripts/latlong.html
    earth_radius_m = 6371e3
    to_rad = 2 * math.pi / 360.
    lat0_rad = lat0 * to_rad
    lat1_rad = lat1 * to_rad
    dlat_rad = (lat1-lat0) * to_rad
    dlon_rad = (lon1-lon0) * to_rad

    lat_term = math.sin(dlat_rad/2.)**2
    lon_term = (math.cos(lat0_rad) * math.cos(lat1_rad) *
                math.sin(dlon_rad/2.)**2)
    hav = lat_term + lon_term
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1-hav))
    return earth_radius_m * central_angle
    

class SondeData(object):
    """Holder for the sonde data belonging to one date.

    NOTE(review): only the date is stored so far and nothing visible in
    this file instantiates the class.
    """
    def __init__(self, date):
        # keep the date handed in; it was previously discarded in favour of None
        self.date = date


# When run as a script, execute the doctests first and stop with exit
# status 1 if any of them fails.
if __name__ == "__main__":
    import doctest
    res = doctest.testmod()
    if res.failed > 0:
        sys.exit(1)

# NOTE(review): everything below sits at module level outside the
# __main__ guard, so it also runs when this file is imported.

# reference point (lat, lon) and a lookup from site name to (lat, lon, alt)
berlin = 52.5, 13.4
sites = {}
for i in range(len(table)):
    site, lat, lon, alt = table[i]
    sites[site] = lat, lon, alt
# one entry per sounding with an orthogonal pair:
# (datetime, site0, site1, spatial distance, temporal distance)
distances = []

# sonde files in the working directory are named like "yymmdd.dat"
for filename in sorted([i for i in os.listdir('.') if i.endswith('.dat') and len(i) == len("yymmdd.dat")]):
    jjmmdd = filename[:6]
    y,m,d = [int(k) for k in ("20" + jjmmdd[:2], jjmmdd[2:4], jjmmdd[4:])]
    for step in [1, 2, 3]: # 6, 12, 18Z 26 Jun 2014
        h = [0,6,12,18][step]
        M, s = 0, 0
        D = datetime.datetime(y,m,d,h,M,s)
        
        data = read_sonde_data(filename, step)
        PRES_hPa, HGHT_m, DRCT_deg, SKNT_knot = [data[i] for i in ["PRES_hPa", "HGHT_m", "DRCT_deg", "SKNT_knot"]]
        drct, drctstd, reduced_indexes, realizations = average_and_resample_directions(PRES_hPa, HGHT_m, DRCT_deg, SKNT_knot)
        # wind speed along the average direction; 0.514 converts knots to m/s
        wind_speed_knot = average_projected_wind_speed(drct, DRCT_deg, SKNT_knot, PRES_hPa)
        wind_speed_mps = wind_speed_knot * 0.514
        parallel, orthogonal = get_station_pairs(table, berlin, drct, drctstd)
        if orthogonal is not None:
            site0, site1, direction_deg, orthogonality, midpoint = orthogonal
            # NOTE(review): the signal speed is computed here but not used
            # further in the visible code
            signal_speed_site0_to_site1_knot = average_projected_wind_speed((direction_deg + 90)%360,
                                                                            DRCT_deg, SKNT_knot, PRES_hPa)
            signal_speed_site0_to_site1 = signal_speed_site0_to_site1_knot * 0.514
            lat0, lon0, alt0 = sites[site0]
            lat1, lon1, alt1 = sites[site1]
            # angle between the pair's connecting line and the wind direction
            alpha = deg2rad * (direction_deg - drct)
            dist_site0_to_site1 = haversine_distance(lat0, lon0, lat1, lon1)
            dist_orth = dist_site0_to_site1 * math.sin(alpha) # d_orth = sin(alpha) * d, orthogonal to wind
            dist_par = dist_site0_to_site1 * math.cos(alpha) # for wind speed: dt = cos(alpha) * d / v_wind
            dspatial_m = dist_orth
            # NOTE(review): divides by the wind speed without a zero check
            dtemporal_s = dist_par / wind_speed_mps 
            print "site0:", site0, "site1:", site1
            print "wind speed (m/s):", wind_speed_mps
            print "spatial distance (m):", dspatial_m
            # for debugging, must be lower than spatial distance
            # because that is how we select the sites:
            # print "parallel distance (m):", dist_par
            print "temporal distance (s):", dtemporal_s
            # now we have the spatial and temporal distance, in 6-hour intervals. Store that with the date.
            distances.append((D, site0, site1, dspatial_m, dtemporal_s))
        else:
            # no usable pair for this sounding: report it, store nothing
            wind_speed_site0_to_site1 = None
            print filename, D, drct, wind_speed_mps, wind_speed_site0_to_site1
            # distances.append(tuple([None]*5))
        # plotting is switched off; the try block is currently a no-op
        try:
            pass
            # plot_wind_barbs(filename, step)
        except Exception as e:
            print filename, ":", e
            print(traceback.format_exc())
        print filename, D, drct, wind_speed_mps
        print
        print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXxx"
        print


# TODO: Plot differences between the sites in distances (from
# <site>.dat) against the interpolated spatial and temporal distance.

# names of all sites that occur as first or second member of a stored pair
pairsites = set([i[1] for i in distances] + [i[2] for i in distances])


# first get all measurements, later select from them based on the date
data = []  # [(datetime, site, xco2), ...] over all sites
datadict = {}  # {site: [(datetime, xco2), ...]} in file order
for i in sorted(pairsites):
    if i is None:
        continue
    datadict[i] = []
    # each site has a space-delimited "<site>.dat" file with a header row;
    # the columns JJMMDD, HHMMSS_UT and XCO2 are read
    with open(i + ".dat") as f:
        r = csv.DictReader(f, skipinitialspace=True, delimiter=" ")
        for j in r:
            jjmmdd = j["JJMMDD"]
            hhmmss_ut = j["HHMMSS_UT"]
            xco2 = float(j["XCO2"])
            # two-digit year, prefixed with "20"
            y,m,d = [int(k) for k in ("20" + jjmmdd[:2], jjmmdd[2:4], jjmmdd[4:])]
            h,M,s = [int(k) for k in (hhmmss_ut[:2], hhmmss_ut[2:4], hhmmss_ut[4:])]
            D = datetime.datetime(y,m,d,h,M,s)
            data.append((D, i, xco2))
            datadict[i].append((D, xco2))


# sort the data by date to interleave the measurements
data = sorted(data)
# create an interpolated wind direction for each date
# select all first elements of pairs: the start of vectors as reference for the date correction
# TODO: do this before selecting the pairs and then select based on interpolated direction.

# shift all measurements temporally to berlin center to have a fixed
# reference point for the temporal signal, independent of the site.


def dateindex_after(date, distances):
    """Return the position of the first entry whose first element is greater than date.

    ``distances`` is a sequence of tuples sorted by their first element;
    the result equals ``len(distances)`` when no entry is greater.
    """
    leading_values = [entry[0] for entry in distances]
    return bisect.bisect_right(leading_values, date)

# Pair every measurement of a pair's first site with a time-shifted,
# interpolated measurement of the pair's second site and with a spatial
# distance interpolated between the two surrounding soundings.
spatialdist_and_dxco2 = [] # [(sdist, dxco2, site0, site1), ...]
spatialdist_and_anomalies = [] # difference between the last measurement and the current one
# temporaldist_and_dxco2 = [] # [(tdist, dxco2, site0, site1), ...]
xco2_last = None
date_and_xco2a_and_xco2b = [] # timeseries of both members of each pair
for date, site, xco2 in data:
    # soundings before (a) and after (b) the measurement; at either end of
    # the list the nearest sounding serves as both.  Without the clamping a
    # measurement after the last sounding raised IndexError and one before
    # the first sounding silently wrapped around to the last sounding.
    idx1 = dateindex_after(date, distances)
    if idx1 == len(distances):
        idx1 = idx1 - 1
        idx0 = idx1
    elif idx1 > 0:
        idx0 = idx1 - 1
    else:
        idx0 = idx1
    Da, site0a, site1a, dspatial_ma, dtemporal_sa = distances[idx0]
    Db, site0b, site1b, dspatial_mb, dtemporal_sb = distances[idx1]
    # use only the first element of pairs. This sticks to sorted order of the sites in the table
    if not (site == site0a and site == site0b):
        continue
    if not (site1a == site1b):
        continue # stick to the same pair
    # shift the date by the temporal distance
    dateshifted = date - datetime.timedelta(seconds=dtemporal_sa)
    # interpolate the best measurement from site1:
    # m0idx / m1idx are the measurements before / after the shifted date,
    # clamped to the ends of the series
    site1meas = datadict[site1a]
    m1idx = dateindex_after(dateshifted, site1meas)
    if m1idx == len(site1meas):
        m1idx = m1idx - 1
        m0idx = m1idx
    elif m1idx > 0:
        m0idx = m1idx - 1
    else:
        m0idx = m1idx
    datedist = (site1meas[m1idx][0] - site1meas[m0idx][0])
    datedistseconds = datedist.days * 3600*24 + datedist.seconds
    # interpolate the second measurement for the first measurement
    dd1 = site1meas[m1idx][0] - dateshifted
    dd0 = site1meas[m0idx][0] - dateshifted
    dd1 = dd1.days * 3600*24 + dd1.seconds
    dd0 = dd0.days * 3600*24 + dd0.seconds
    if abs(datedist.days) > 1 or abs(datedistseconds) > 3600:
        # gap of more than an hour between the two measurements: take the
        # nearer one, but only if it is close enough in time
        if abs(dd1) > abs(dd0):
            if abs(dd0) > 0.1 * abs(dtemporal_sa):
                continue # skip this measurement: too large temporal distance
            interpolated = site1meas[m0idx][1]
        else:
            if abs(dd1) > 0.1 * abs(dtemporal_sa):
                continue # skip this measurement: too large temporal distance
            interpolated = site1meas[m1idx][1]
    elif datedistseconds == 0:
        # both indexes point at the same time (e.g. clamped at an end)
        # NOTE(review): no check of the distance to dateshifted here — confirm
        interpolated = 0.5 * (site1meas[m1idx][1] + site1meas[m0idx][1])
    else:
        # linear interpolation; dd0 is negative when dateshifted lies
        # between the two measurements, hence the minus sign
        interpolated = (site1meas[m0idx][1] * (dd1 / float(datedistseconds)) -
                        site1meas[m1idx][1] * (dd0 / float(datedistseconds)))
    # interpolate the meteorologic data
    Ddist = Db - Da
    Ddistseconds = Ddist.days * 3600*24 + Ddist.seconds
    Dd1 = Db - date
    Dd0 = Da - date
    Dd1 = Dd1.days * 3600*24 + Dd1.seconds
    Dd0 = Dd0.days * 3600*24 + Dd0.seconds
    if abs(Ddist.days) > 1 or abs(Ddistseconds) > 3600*7: # distance during the day: 6h
        # measurements early in the morning or late in the evening:
        # use the distance of the nearer sounding
        if abs(Dd1) > abs(Dd0):
            dspatial_m = dspatial_ma
        else:
            dspatial_m = dspatial_mb
    elif Ddistseconds == 0:
        # same sounding on both sides (also the clamped case)
        dspatial_m = 0.5 * (dspatial_ma + dspatial_mb)
    else:
        # linear interpolation between the two soundings
        dspatial_m = ((dspatial_ma * (float(Dd1) / float(Ddistseconds))) -
                      (dspatial_mb * (float(Dd0) / float(Ddistseconds))))

    spatialdist_and_dxco2.append((dspatial_m, interpolated - xco2, site, site1a))
    date_and_xco2a_and_xco2b.append((date, xco2, interpolated, site, site1a))
    # change against the previously accepted measurement, whatever its site
    if xco2_last is not None:
        spatialdist_and_anomalies.append((dspatial_m, xco2 - xco2_last, site, site1a))
    xco2_last = xco2


# One colormap per station pair in the scatter plot below; it is indexed
# with the running number of the pair, so at most six pairs are supported.
colormaps = [
    pl.cm.winter,
    pl.cm.summer,
    pl.cm.copper_r,
    pl.cm.cool_r,
    pl.cm.magma,
    pl.cm.viridis,
]


class Paul9:
    """Named colors from the colorblind-safe colorschemes created by Paul
Tol. According to the original description the set of the 9 most
distinct colors can be mixed safely in any way you like.

NOTE(review): the class defines eleven names; darkbrown and brightbrown
are not part of the ``colors`` list built below — confirm which nine
belong to the set.
    """
    # hex RGB strings
    darklila = "#332288"
    blue = "#88ccee"
    olive = "#999933"
    purple = "#aa4499"
    bluegreen = "#44aa99"
    green = "#117733"
    yellow = "#ddcc77"
    rosa = "#cc6677"
    rosepurple = "#882255"
    darkbrown = "#aa4411"
    brightbrown = "#ddaa77"
# Nine colors, repeated once to give 18 entries; the daily-cycle plot
# below picks one entry per day by its running number.
colors = [Paul9.darklila, Paul9.blue, Paul9.olive, Paul9.purple, # these first 4 colors are the gray-scale optimized colors
          Paul9.bluegreen, Paul9.green, Paul9.yellow, 
          Paul9.rosa, Paul9.rosepurple]*2

# Markers for the scatter plot below, indexed by the running number of
# the station pair.
# NOTE(review): the integer entries are presumably matplotlib's numeric marker codes — confirm
safemarkersfilled = ['H', '^', '*', 'h', 'd', 'D', 's', '.', 
                     '8', 'p', 'v', '<', '>', 7, 4, 5, 6]

    
# Daily cycle: plot both members of every pair against the time of day,
# one color per day.
# NOTE(review): the names unpacked here are overwritten by the loop right
# below, and the unpacking fails when no measurement was paired.
date, xco2a, xco2b_interpolated, site0, site1 = zip(*date_and_xco2a_and_xco2b)
dailies_a = {} # {ddays: [time_of_day, xco2], ...}
dailies_b = {}
for date, xco2a, xco2b_interpolated, site0, site1 in date_and_xco2a_and_xco2b:
    # day number relative to 2014-06-26
    ddays = (date - datetime.datetime(2014, 6, 26)).days
    if ddays not in dailies_a:
        dailies_a[ddays] = []
    if ddays not in dailies_b:
        dailies_b[ddays] = []
    # the time of day is placed on a fixed dummy date (2000-01-01) so that
    # all days share one time axis
    dailies_a[ddays].append((datetime.datetime(2000, 1, 1, date.hour, date.minute, date.second),
                             xco2a))
    dailies_b[ddays].append((datetime.datetime(2000, 1, 1, date.hour, date.minute, date.second),
                             xco2b_interpolated))
# NOTE(review): maxddays is not used further in the visible code
maxddays = max(dailies_a.keys())
for n, ddays in enumerate(sorted(dailies_a.keys())):
    # only the first day carries legend labels
    if n == 0:
        label1 = "first in pair"
        label2 = "second in pair, interpolated to time of first"
    else:
        label1, label2 = [None]*2
    # colors has 18 entries, so more than 18 days raise an IndexError here
    pl.plot(*zip(*[i[:2] for i in dailies_a[ddays]]), marker="*",
            color=colors[n], linestyle="",
            label=label1)
    pl.plot(*zip(*[i[:2] for i in dailies_b[ddays]]), marker="^",
            color=colors[n], linestyle="",
            label=label2)
pl.title("daily cycle in June and July 2014")
pl.ylabel("XCO$_2$ / ppm")
pl.legend(loc="best", fancybox=True, framealpha=0.5)
pl.gcf().autofmt_xdate()
pl.gca().fmt_xdata = mdates.DateFormatter('%y-%m-%dT%H-%M-%S')
pl.show()


# Group the (spatial distance, xco2 difference) values by station pair:
# {(site0, site1): [(dspat, dxco2), ...]}
sitepairs = {}
for dspat, dxco2, site0, site1 in spatialdist_and_dxco2:
    pair = (site0, site1)
    if pair in sitepairs:
        sitepairs[pair].append((dspat, dxco2))
    else:
        sitepairs[pair] = [(dspat, dxco2)]
    
# Scatter plot of the pair differences against the spatial distance, one
# marker and colormap per station pair, followed by a boxplot of the
# differences in three distance classes.
legend = []
# differences binned by distance: below 7.5e3, 7.5e3 up to 15e3, 15e3 and above
l75, ge75l150, ge150 = [], [], [] 
for i, (pair, vals) in enumerate(sorted(sitepairs.items())):
    # color by position in the list of values of this pair
    c = range(len(vals))
    # NOTE(review): x and y are not used further in the visible code
    x, y = zip(*vals)
    l75.extend([v for di, v in vals if di < 7.5e3])
    ge75l150.extend([v for di, v in vals if 7.5e3 <= di < 15e3])
    ge150.extend([v for di, v in vals if 15e3 <= di])
    # colormaps has six entries, so a seventh pair raises an IndexError here
    pl.scatter(*zip(*vals),
               marker=safemarkersfilled[i],
               c=c, cmap=colormaps[i],
               # edgecolors='none',
               lw=0.5,
               s=100,
               alpha=0.5)
    # add a label with the first color of the colormap
    # (empty plots serve as legend handles: a left half-filled marker in
    # the colormap's first color and a right half-filled one in its last)
    m0, = pl.plot([], [],
                  marker=safemarkersfilled[i],
                  color=colormaps[i](0),
                  markeredgewidth=0.5,
                  fillstyle="left",
                  markersize=12,
                  label="%s %s" % (pair[0], pair[1]))
    m1, = pl.plot([], [],
                  marker=safemarkersfilled[i],
                  color=colormaps[i](255),
                  markeredgewidth=0.5,
                  fillstyle="right",
                  markersize=12,
                  label="%s %s" % (pair[0], pair[1]))
    legend.append((m0, m1, "%s %s" % (pair[0], pair[1])))

# changes between consecutive accepted measurements, for comparison
m2 = pl.scatter(*zip(*[i[:2] for i in spatialdist_and_anomalies]),
                marker="+", color="gray", alpha=0.4, s=100, label="anomalies")

# NOTE(review): [m2, m2] appends two separate handles while only one label
# is added; a single (m2, m2) tuple may have been intended — confirm
pl.legend(tuple([i[:2] for i in legend] + [m2, m2]),
          tuple([i[2] for i in legend] + ["anomalies"]),
          numpoints=1, loc='lower left',
          fancybox=True, framealpha=0.5)
pl.xlabel("spatial distance")
pl.ylabel("dXCO$_2$")
pl.title("pair difference vs. spatial distance")
pl.ylim(-2, 2)
pl.show()
pl.close()
# boxplot of the binned differences; 0.61803 is used for both the box
# width and the aspect ratio
pl.boxplot([l75, ge75l150, ge150], notch=1, widths=0.61803)
pl.xticks([1, 2, 3], ["<7.5", "7.5$-$15", ">15"])
pl.gca().set_aspect(0.61803)
pl.grid(axis="y")
pl.title("CO$_2$ column differences")
pl.xlabel("effective distance / km")
pl.ylabel("$\Delta$CO$_2$ / ppm")
pl.savefig("dco2-vs-spatial-distance-boxplot.pdf", bbox_inches="tight")

# for site in datadict:
#     pl.plot(*zip(*datadict[site]), label=site)
# pl.legend()
# pl.gcf().autofmt_xdate()
# pl.gca().fmt_xdata = mdates.DateFormatter('%y-%m-%dT%H-%M-%S')
# pl.show()
# 
