from sklearn.feature_extraction import DictVectorizer
measurements = [
{'city': 'Dubai', 'temperature': 33.},
{'city': 'London', 'temperature': 12.},
{'city': 'San Francisco', 'temperature': 18.},
{'city': 'Dubai', 'temperature': 40.},
{'temperature': 20.}, # missing categorical feature ('city')
{'city': 'Dubai'} # missing numerical feature ('temperature'); DictVectorizer fills it with the default value 0
]
vec = DictVectorizer()
vec.fit_transform(measurements).toarray() # numpy.ndarray
array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.],
       [ 1.,  0.,  0., 40.],
       [ 0.,  0.,  0., 20.],
       [ 1.,  0.,  0.,  0.]])
vec.get_feature_names() # get a list of feature names (on scikit-learn >= 1.0, use get_feature_names_out() instead)
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
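Once the vectorizer has been fitted, it can be applied to new samples; feature values it did not see during fit are silently dropped. A minimal sketch ('Tokyo' is a made-up value that was not in the training data), which should leave every city column at 0:
vec.transform([{'city': 'Tokyo', 'temperature': 25.}]).toarray() # unseen category -> all city columns stay 0
array([[ 0.,  0.,  0., 25.]])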
movie_entry = [{'category': ['thriller', 'drama'], 'year': 2003},
{'category': ['animation', 'family'], 'year': 2011},
{'category': [], 'year': 2011}, # empty list of categorical features
{'year': 1974}, # missing categorical feature ('category')
{'category': ['thriller']}, # missing numerical feature ('year'); defaults to 0
]
vec = DictVectorizer()
vec.fit_transform(movie_entry).toarray() # numpy.ndarray
array([[0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.003e+03],
       [1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.011e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 2.011e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.974e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00]])
vec.get_feature_names() # get a list of feature names
['category=animation', 'category=drama', 'category=family', 'category=thriller', 'year']
from sklearn.feature_extraction import FeatureHasher
measurements = [
{'city': 'Dubai', 'temperature': 33.},
{'city': 'London', 'temperature': 12.},
{'city': 'San Francisco', 'temperature': 18.},
{'city': 'Dubai', 'temperature': 40.},
{'temperature': 20.}, # missing categorical feature ('city')
{'city': 'Dubai'} # missing numerical feature ('temperature'); defaults to 0
]
h = FeatureHasher(n_features=4)
h.fit_transform(measurements).toarray()
array([[  0., -33.,  -1.,   0.],
       [  0., -12.,  -1.,   0.],
       [ -1., -18.,   0.,   0.],
       [  0., -40.,  -1.,   0.],
       [  0., -20.,   0.,   0.],
       [  0.,   0.,  -1.,   0.]])
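The negative entries come from the signed hash FeatureHasher applies to reduce the bias introduced by hash collisions. If non-negative values are preferred, the hasher can be constructed with alternate_sign=False; a small sketch on the same measurements as above:
h = FeatureHasher(n_features=4, alternate_sign=False)
h.fit_transform(measurements).toarray() # same hashed columns, but every value is added with a positive sign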
$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$, where $\text{tf}(t, d)$ is the count of term $t$ in document $d$. With smooth_idf=False, $\text{idf}(t) = \ln\frac{n}{df(t)} + 1$, where $n$ is the total number of documents and $df(t)$ is the number of documents that contain term $t$ (the default smooth_idf=True adds 1 to both $n$ and $df(t)$). Each resulting tf-idf vector is then normalized to unit Euclidean (L2) norm.
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
# 1-gram
vectorizer = CountVectorizer()
vectorizer.fit_transform(corpus).toarray()
array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])
vectorizer.get_feature_names() # get a list of feature names
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
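The column index of an individual term can also be looked up in the fitted vocabulary; a small sketch:
vectorizer.vocabulary_.get('document') # column index of the term 'document' in the array above
1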
# 2-gram
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2), # extract both 1-grams and 2-grams; default is (1, 1)
    token_pattern=r'\b\w+\b', # keep single-character tokens too; default is r'(?u)\b\w\w+\b'
    min_df=1 # ignore terms whose document frequency is strictly below this threshold
             # (a float is a proportion of documents, an int is an absolute count)
)
bigram_vectorizer.fit_transform(corpus).toarray()
array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])
bigram_vectorizer.get_feature_names() # get a list of feature names
['and', 'and the', 'document', 'first', 'first document', 'is', 'is the', 'is this', 'one', 'second', 'second document', 'second second', 'the', 'the first', 'the second', 'the third', 'third', 'third one', 'this', 'this is', 'this the']
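Because bigrams keep some local word order, documents made of the same words can still be distinguished; for example (a short sketch reusing the fitted bigram_vectorizer):
feature_index = bigram_vectorizer.vocabulary_.get('is this')
bigram_vectorizer.fit_transform(corpus).toarray()[:, feature_index] # 'is this' occurs only in the last document
array([0, 0, 0, 1])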
corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
# 1-gram
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(corpus).toarray() # raw term counts, used as input to TfidfTransformer below
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer.fit_transform(counts).toarray()
array([[0.        , 0.43306685, 0.56943086, 0.43306685, 0.        , 0.        , 0.33631504, 0.        , 0.43306685],
       [0.        , 0.24014568, 0.        , 0.24014568, 0.        , 0.89006176, 0.18649454, 0.        , 0.24014568],
       [0.56115953, 0.        , 0.        , 0.        , 0.56115953, 0.        , 0.23515939, 0.56115953, 0.        ],
       [0.        , 0.43306685, 0.56943086, 0.43306685, 0.        , 0.        , 0.33631504, 0.        , 0.43306685]])
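As a sanity check, the first row can be reproduced by hand from counts: with smooth_idf=False the idf is ln(n / df(t)) + 1, and each row is then L2-normalized. A minimal sketch:
import numpy as np
n = counts.shape[0] # number of documents (4)
df = (counts > 0).sum(axis=0) # document frequency of each term
idf = np.log(n / df) + 1 # idf with smooth_idf=False
row0 = counts[0] * idf # un-normalized tf-idf of the first document
row0 / np.linalg.norm(row0) # should match the first row of the array above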
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus).toarray()
array([[0.        , 0.43877674, 0.54197657, 0.43877674, 0.        , 0.        , 0.35872874, 0.        , 0.43877674],
       [0.        , 0.27230147, 0.        , 0.27230147, 0.        , 0.85322574, 0.22262429, 0.        , 0.27230147],
       [0.55280532, 0.        , 0.        , 0.        , 0.55280532, 0.        , 0.28847675, 0.55280532, 0.        ],
       [0.        , 0.43877674, 0.54197657, 0.43877674, 0.        , 0.        , 0.35872874, 0.        , 0.43877674]])
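TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer; with default settings (smooth_idf=True, unlike the transformer example above) the two routes should agree. A small sketch:
import numpy as np
two_step = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(corpus))
np.allclose(two_step.toarray(), TfidfVectorizer().fit_transform(corpus).toarray()) # True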
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)
hv.transform(corpus).toarray() # HashingVectorizer is stateless, so this equals hv.fit_transform(corpus).toarray()
array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        , -0.57735027,  0.57735027, -0.57735027,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.81649658,  0.        ,  0.40824829, -0.40824829,  0.        ],
       [ 0.        ,  0.5       ,  0.        ,  0.        , -0.5       , -0.5       ,  0.        ,  0.        , -0.5       ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        , -0.57735027,  0.57735027, -0.57735027,  0.        ]])
import pandas as pd
data = pd.read_csv('housing.csv')
from sklearn.model_selection import train_test_split
import numpy as np
data_X = data.drop(['median_house_value'], axis=1) # features (DataFrame)
data_Y = data['median_house_value'] # target (Series)
# bucket median income into categories so the split can be stratified on it
data['income_cat'] = np.ceil(data['median_income']/1.5)
data['income_cat'] = data['income_cat'].where(data['income_cat'] < 5, 5.0) # cap the categories at 5
train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size=0.2, random_state=42, stratify=data['income_cat'])
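The stratification can be verified by comparing the income_cat proportions of the full dataset with those of the training split (a quick sketch; the indices of train_X still refer to rows of data):
data['income_cat'].value_counts(normalize=True) # proportions in the full dataset
data.loc[train_X.index, 'income_cat'].value_counts(normalize=True) # proportions in the training split, nearly identical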
train_X.head()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity |
|---|---|---|---|---|---|---|---|---|---|
| 17606 | -121.89 | 37.29 | 38.0 | 1568.0 | 351.0 | 710.0 | 339.0 | 2.7042 | <1H OCEAN |
| 18632 | -121.93 | 37.05 | 14.0 | 679.0 | 108.0 | 306.0 | 113.0 | 6.4214 | <1H OCEAN |
| 14650 | -117.20 | 32.77 | 31.0 | 1952.0 | 471.0 | 936.0 | 462.0 | 2.8621 | NEAR OCEAN |
| 3230 | -119.61 | 36.31 | 25.0 | 1847.0 | 371.0 | 1460.0 | 353.0 | 1.8839 | INLAND |
| 3555 | -118.59 | 34.23 | 17.0 | 6592.0 | 1525.0 | 4459.0 | 1463.0 | 3.0347 | <1H OCEAN |
vectorizer = CountVectorizer()
vectorizer.fit_transform(train_X['ocean_proximity']) # train_X['ocean_proximity'] is a Series, which CountVectorizer accepts as an iterable of documents
vectorizer.fit_transform(train_X[['ocean_proximity']]) # train_X[['ocean_proximity']] is a DataFrame; iterating over it yields only the column name, so the result is a 1x1 matrix
<1x1 sparse matrix of type '<class 'numpy.int64'>' with 1 stored elements in Compressed Sparse Row format>
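Here CountVectorizer acts as a makeshift one-hot encoder; a more conventional choice for a single categorical column is OneHotEncoder, which expects a 2-D input such as a one-column DataFrame (a minimal sketch):
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore') # categories unseen at transform time become all-zero rows
encoder.fit_transform(train_X[['ocean_proximity']]) # sparse matrix with one column per category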
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
('one_hot', CountVectorizer()), # CountVectorizer used as a makeshift one-hot encoder; it needs a 1-D (Series) input
])
from sklearn.compose import ColumnTransformer
preprocess_pipeline = ColumnTransformer([
("num_pipeline", num_pipeline, ['longitude','latitude']), # pass a DataFrame into num_pipeline
("cat_pipeline", cat_pipeline, 'ocean_proximity'), # pass a Series into cat_pipeline
])
preprocess_pipeline.fit_transform(train_X)
array([[-1.15604281,  0.77194962,  1.        , ...,  0.        ,  0.        ,  1.        ],
       [-1.17602483,  0.6596948 ,  1.        , ...,  0.        ,  0.        ,  1.        ],
       [ 1.18684903, -1.34218285,  0.        , ...,  0.        ,  1.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134,  0.        , ...,  0.        ,  0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  1.        , ...,  0.        ,  0.        ,  1.        ],
       [-1.43579109,  0.99645926,  0.        , ...,  0.        ,  1.        ,  0.        ]])
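The preprocessing can also be chained with an estimator so that a single fit call covers both steps; a rough sketch with LinearRegression (any other regressor could be substituted):
from sklearn.linear_model import LinearRegression
full_pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('regressor', LinearRegression()),
])
full_pipeline.fit(train_X, train_Y) # fits the imputer, scaler, vectorizer and the regressor in one go
full_pipeline.predict(test_X) # the same preprocessing is applied to the test set before predicting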